TABLE OF CONTENTS
- ABINIT/m_fftw3
- m_fftw3/cplan_many_dft
- m_fftw3/dplan_many_dft_1D
- m_fftw3/dplan_many_dft_2D
- m_fftw3/dplan_many_dft_c2r
- m_fftw3/dplan_many_dft_r2c
- m_fftw3/fftw3_accrho
- m_fftw3/fftw3_alloc_complex1d_dpc
- m_fftw3/fftw3_alloc_complex1d_spc
- m_fftw3/fftw3_alloc_real1d_dp
- m_fftw3/fftw3_alloc_real2d_dp
- m_fftw3/fftw3_applypot
- m_fftw3/fftw3_applypot_many
- m_fftw3/fftw3_c2c_ip_dpc
- m_fftw3/fftw3_c2c_ip_spc
- m_fftw3/fftw3_c2c_op_dpc
- m_fftw3/fftw3_c2c_op_spc
- m_fftw3/fftw3_c2r_op
- m_fftw3/fftw3_cleanup
- m_fftw3/fftw3_destroy_plan
- m_fftw3/fftw3_execute_dft_dp
- m_fftw3/fftw3_execute_dft_dpc
- m_fftw3/fftw3_execute_dft_spc
- m_fftw3/fftw3_fftpad_dp
- m_fftw3/fftw3_fftpad_dpc
- m_fftw3/fftw3_fftpad_spc
- m_fftw3/fftw3_fftrisc_dp
- m_fftw3/fftw3_fftrisc_mixprec
- m_fftw3/fftw3_fftrisc_sp
- m_fftw3/fftw3_fftug_dp
- m_fftw3/fftw3_fftug_dpc
- m_fftw3/fftw3_fftug_spc
- m_fftw3/fftw3_fftur_dp
- m_fftw3/fftw3_fftur_dpc
- m_fftw3/fftw3_fftur_spc
- m_fftw3/fftw3_init_threads
- m_fftw3/fftw3_many_dft_ip
- m_fftw3/fftw3_many_dft_op
- m_fftw3/fftw3_mpiback
- m_fftw3/fftw3_mpiback_manywf
- m_fftw3/fftw3_mpiback_wf
- m_fftw3/fftw3_mpiforw
- m_fftw3/fftw3_mpiforw_manywf
- m_fftw3/fftw3_mpiforw_wf
- m_fftw3/fftw3_mpifourdp_c2c
- m_fftw3/fftw3_mpifourdp_c2r
- m_fftw3/fftw3_mpifourdp_r2c
- m_fftw3/fftw3_plan3_t
- m_fftw3/fftw3_poisson
- m_fftw3/fftw3_r2c_op
- m_fftw3/fftw3_seqfourdp
- m_fftw3/fftw3_seqfourwf
- m_fftw3/fftw3_set_nthreads
- m_fftw3/fftw3_spawn_threads_here
- m_fftw3/fftw3_use_lib_threads
- m_fftw3/fftw3mpi_many_dft_ip
- m_fftw3/fftw3mpi_many_dft_tr
- m_fftw3/fftwmpi_free_work_array
- m_fftw3/fftwmpi_get_work_array
- m_fftw3/old_fftw3_mpifourdp
- m_fftw3/zplan_many_dft
- m_m_fftw3/fftw3_mpifourdp
ABINIT/m_fftw3 [ Modules ]
NAME
m_fftw3
FUNCTION
This module provides wrappers for the FFTW3 routines: in-place and out-of-place version.
COPYRIGHT
Copyright (C) 2009-2024 ABINIT group (MG, FD) This file is distributed under the terms of the GNU General Public License, see ~abinit/COPYING or http://www.gnu.org/copyleft/gpl.txt .
NOTES
1) MPI parallelism is in testing stage 2) For better performance the FFT divisions should contain small factors [2, 3, 5, 7, 11]
SOURCE
20 #if defined HAVE_CONFIG_H 21 #include "config.h" 22 #endif 23 24 #include "abi_common.h" 25 26 ! It seems that MKL wrappers do not like the advanced interfaces for 27 ! r2c and c2r transforms although they work fine if the true FFTW3 library is used. 28 !#define DEV_RC_BUG 29 #undef DEV_RC_BUG 30 31 #define FFTLIB "FFTW3" 32 #define FFT_PREF(name) CONCAT(fftw3_,name) 33 #define SPAWN_THREADS_HERE(ndat, nthreads) fftw3_spawn_threads_here(ndat, nthreads) 34 #define FFT_DOUBLE 1 35 #define FFT_SINGLE 2 36 #define FFT_MIXPREC 3 37 38 MODULE m_fftw3 39 40 use defs_basis 41 use m_abicore 42 use m_errors 43 use m_xomp 44 use m_xmpi 45 use m_hide_blas 46 use m_cgtools 47 use m_cplxtools 48 use m_distribfft 49 use m_fftcore 50 use, intrinsic :: iso_c_binding 51 52 use m_time, only : timab 53 use m_numeric_tools, only : imax_loc 54 use defs_abitypes, only : MPI_type 55 use m_mpinfo, only : ptabs_fourwf 56 use m_fstrings, only : strcat, itoa, sjoin 57 use m_fft_mesh, only : zpad_t, zpad_init, zpad_free 58 59 implicit none 60 61 #ifdef HAVE_FFTW3_MPI 62 include 'fftw3-mpi.f03' 63 #endif 64 65 !This should be done but MKL fftw hasn't always this include file 66 !#ifdef HAVE_FFT_FFTW3 67 ! include 'fftw3.f03' 68 !#endif 69 70 private 71 72 ! Entry points for client code 73 public :: fftw3_seqfourdp ! 3D FFT of lengths nx, ny, nz. Mainly used for densities or potentials. 74 public :: fftw3_seqfourwf ! FFT transform of wavefunctions (high-level interface). 75 public :: fftw3_fftrisc 76 public :: fftw3_fftrisc_mixprec ! Mixed precision version of fftrisc: input/output in dp, computation done in sp. 77 public :: fftw3_fftug ! G-->R. 3D zero-padded FFT of lengths nx, ny, nz. Mainly used for wavefunctions 78 public :: fftw3_fftur ! R-->G, 3D zero-padded FFT of lengths nx, ny, nz. Mainly used for wavefunctions 79 public :: fftw3_use_lib_threads 80 public :: fftw3_spawn_threads_here 81 82 public :: fftw3_mpifourdp 83 84 ! Low-level routines. 85 public :: fftw3_cleanup ! Reset FFTW to the pristine state it was in when you started your program, 86 public :: fftw3_init_threads ! one-time initialization required to use FFTW3 threads. 87 public :: fftw3_set_nthreads ! Set the number of threads you want FFTW3 to use when HAVE_FFT_FFTW3_THREADS is defined. 88 public :: fftw3_r2c_op ! Real to complex transform (out-of-place version). 89 public :: fftw3_c2r_op ! Complex to real transform (out-of-place version). 90 public :: fftw3_c2c_op ! complex to complex transform (out-of-place version). 91 public :: fftw3_c2c_ip ! complex to complex transform (in-place version). 92 public :: fftw3_many_dft_op ! Driver routine for many out-of-place 3D complex-to-complex FFTs. 93 public :: fftw3_many_dft_ip ! Driver routine for many in-place 3D complex-to-complex FFTs. 94 public :: fftw3_fftpad ! Driver routines for zero-padded FFT of wavefunctions. 95 public :: fftw3_fftpad_dp ! Driver routines for zero-padded FFT of wavefunctions. 96 public :: fftw3_fftug_dp ! Driver routines for zero-padded FFT of wavefunctions. 97 public :: fftw3_poisson ! Solve the poisson equation in G-space starting from n(r). 98 99 ! MPI version 100 public :: fftw3_mpiback_wf 101 public :: fftw3_mpiback_manywf 102 public :: fftw3_mpiforw_wf 103 public :: fftw3_mpiforw_manywf 104 public :: fftw3_mpiback 105 public :: fftw3_mpiforw 106 public :: fftw3_applypot 107 public :: fftw3_applypot_many 108 public :: fftw3_accrho 109 110 #ifdef HAVE_FFTW3_MPI 111 ! flags copied from fftw3.f 112 integer,public,parameter :: ABI_FFTW_FORWARD = FFTW_FORWARD 113 integer,public,parameter :: ABI_FFTW_BACKWARD = FFTW_BACKWARD 114 integer,public,parameter :: ABI_FFTW_ESTIMATE = FFTW_ESTIMATE 115 integer,public,parameter :: ABI_FFTW_MEASURE = FFTW_MEASURE 116 ! end flags copied from fftw3.f 117 integer,public,parameter :: ABI_FFTW_MPI_TRANSPOSED_IN = FFTW_MPI_TRANSPOSED_IN 118 integer,public,parameter :: ABI_FFTW_MPI_TRANSPOSED_OUT = FFTW_MPI_TRANSPOSED_OUT 119 ! end flags copies from fftw3-mpi.f03 120 #else 121 integer,public,parameter :: ABI_FFTW_FORWARD = -1 122 integer,public,parameter :: ABI_FFTW_BACKWARD = +1 123 integer,public,parameter :: ABI_FFTW_ESTIMATE = 64 124 integer,public,parameter :: ABI_FFTW_MEASURE = 0 125 ! end flags copied from fftw3.f 126 integer,public,parameter :: ABI_FFTW_MPI_TRANSPOSED_IN = 536870912 127 integer,public,parameter :: ABI_FFTW_MPI_TRANSPOSED_OUT = 1073741824 128 ! end flags copies from fftw3-mpi.f03 129 #endif 130 131 ! ========================================================================================== 132 ! ==== Variables introduced for the FFTW3 interface in abinit. Not belonging to fftw3.f ==== 133 ! ========================================================================================== 134 135 integer,public,parameter :: NULL_PLAN = 0 136 ! MKL wrappers might return NULL_PLAN if a particular FFTW3 feature is not available 137 138 integer,public,parameter :: KIND_FFTW_PLAN = 8 139 ! It should be at least integer*@SIZEOF_INT_P@ 140 ! MKL wrappers requires it to be integer*8, so do _not_ use C_INTPTR_T. 141 142 #ifdef HAVE_FFTW3_THREADS 143 integer,private,save :: THREADS_INITED = 0 144 ! 1 if treads have been initialized. 0 otherwise. 145 #endif 146 147 logical,private,save :: USE_LIB_THREADS = .FALSE.
m_fftw3/cplan_many_dft [ Functions ]
[ Top ] [ m_fftw3 ] [ Functions ]
NAME
FUNCTION
INPUTS
SIDE EFFECTS
SOURCE
2611 !! FIXME technically it should be intent(inout) since FFTW3 can destroy the input for particular flags. 2612 2613 function cplan_many_dft(rank,n,howmany,fin,inembed,istride,idist,fout,onembed,ostride,odist,sign,flags,nthreads) result(plan) 2614 2615 !Arguments ------------------------------------ 2616 !scalars 2617 integer,intent(in) :: rank,howmany,istride,ostride, sign,flags,idist,odist,nthreads 2618 integer,intent(in) :: n(rank),inembed(rank),onembed(rank) 2619 integer(KIND_FFTW_PLAN) :: plan 2620 !arrays 2621 complex(spc) :: fin(*),fout(*) 2622 2623 !Local variables------------------------------- 2624 character(len=500) :: msg,frmt 2625 2626 ! ************************************************************************* 2627 2628 !$OMP CRITICAL (OMPC_cplan_many_dft) 2629 call fftw3_set_nthreads(nthreads) 2630 2631 call sfftw_plan_many_dft(plan, rank, n, howmany, & 2632 & fin, inembed, istride, idist, fout, onembed, ostride, odist, sign, flags) 2633 !$OMP END CRITICAL (OMPC_cplan_many_dft) 2634 2635 if (plan==NULL_PLAN) then ! handle the error 2636 call wrtout(std_out, "sfftw_plan_many_dft returned NULL_PLAN (complex version)") 2637 write(frmt,*)"(a,",rank,"(1x,i0),3(a,i0),a,2(a,",rank,"(1x,i0),2(a,i0),a))" 2638 write(msg,frmt)& 2639 & " n = ",n," howmany = ",howmany," sign = ",sign," flags = ",flags,ch10,& 2640 & " inembed = ",inembed," istride = ",istride," idist =",idist,ch10, & 2641 & " onembed = ",onembed," ostride = ",ostride," odist =",idist,ch10 2642 call wrtout(std_out, msg) 2643 ABI_ERROR("Check FFTW library and/or abinit code") 2644 end if 2645 2646 end function cplan_many_dft
m_fftw3/dplan_many_dft_1D [ Functions ]
[ Top ] [ m_fftw3 ] [ Functions ]
NAME
FUNCTION
INPUTS
SIDE EFFECTS
SOURCE
2514 function dplan_many_dft_1D(rank,n,howmany,fin,inembed,istride,idist,fout,onembed,ostride,odist,sign,flags,nthreads) result(plan) 2515 2516 !Arguments ------------------------------------ 2517 !scalars 2518 integer,intent(in) :: rank,howmany,istride,ostride, sign,flags,idist,odist,nthreads 2519 integer,intent(in) :: n(rank),inembed(rank),onembed(rank) 2520 integer(KIND_FFTW_PLAN) :: plan 2521 !arrays 2522 real(dp) :: fin(*),fout(*) 2523 2524 !Local variables------------------------------- 2525 character(len=500) :: msg,frmt 2526 2527 ! ************************************************************************* 2528 2529 !$OMP CRITICAL (OMPC_dfftw_plan_many_dft_1D) 2530 call fftw3_set_nthreads(nthreads) 2531 2532 call dfftw_plan_many_dft(plan, rank, n, howmany, & 2533 & fin, inembed, istride, idist, fout, onembed, ostride, odist, sign, flags) 2534 !$OMP END CRITICAL (OMPC_dfftw_plan_many_dft_1D) 2535 2536 if (plan==NULL_PLAN) then 2537 call wrtout(std_out, "dfftw_plan_many_dft returned NULL_PLAN!") 2538 write(frmt,*)"(a,",rank,"(1x,i0),3(a,i0),a,2(a,",rank,"(1x,i0),2(a,i0),a))" 2539 write(msg,frmt)& 2540 & " n= ",n," howmany= ",howmany," sign= ",sign," flags= ",flags,ch10,& 2541 & " inembed= ",inembed," istride= ",istride," idist=",idist,ch10, & 2542 & " onembed= ",onembed," ostride= ",ostride," odist=",idist,ch10 2543 call wrtout(std_out, msg) 2544 ABI_ERROR("Check FFTW library and/or abinit code") 2545 end if 2546 2547 end function dplan_many_dft_1D
m_fftw3/dplan_many_dft_2D [ Functions ]
[ Top ] [ m_fftw3 ] [ Functions ]
NAME
FUNCTION
INPUTS
SIDE EFFECTS
SOURCE
2563 function dplan_many_dft_2D(rank,n,howmany,fin,inembed,istride,idist,fout,onembed,ostride,odist,sign,flags,nthreads) result(plan) 2564 2565 !Arguments ------------------------------------ 2566 !scalars 2567 integer,intent(in) :: rank,howmany,istride,ostride, sign,flags,idist,odist,nthreads 2568 integer,intent(in) :: n(rank),inembed(rank),onembed(rank) 2569 integer(KIND_FFTW_PLAN) :: plan 2570 !arrays 2571 real(dp) :: fin(2,*),fout(2,*) 2572 2573 !Local variables------------------------------- 2574 character(len=500) :: msg,frmt 2575 2576 ! ************************************************************************* 2577 2578 !$OMP CRITICAL (OMPC_dfftw_plan_many_dft_2D) 2579 call fftw3_set_nthreads(nthreads) 2580 2581 call dfftw_plan_many_dft(plan, rank, n, howmany, & 2582 & fin, inembed, istride, idist, fout, onembed, ostride, odist, sign, flags) 2583 !$OMP END CRITICAL (OMPC_dfftw_plan_many_dft_2D) 2584 2585 if (plan==NULL_PLAN) then 2586 call wrtout(std_out, "dfftw_plan_many_dft returned NULL_PLAN!") 2587 write(frmt,*)"(a,",rank,"(1x,i0),3(a,i0),a,2(a,",rank,"(1x,i0),2(a,i0),a))" 2588 write(msg,frmt)& 2589 & " n= ",n," howmany= ",howmany," sign= ",sign," flags= ",flags,ch10,& 2590 & " inembed= ",inembed," istride= ",istride," idist=",idist,ch10, & 2591 & " onembed= ",onembed," ostride= ",ostride," odist=",idist,ch10 2592 call wrtout(std_out, msg) 2593 ABI_ERROR("Check FFTW library and/or abinit code") 2594 end if 2595 2596 end function dplan_many_dft_2D
m_fftw3/dplan_many_dft_c2r [ Functions ]
[ Top ] [ m_fftw3 ] [ Functions ]
NAME
FUNCTION
INPUTS
SIDE EFFECTS
SOURCE
2763 function dplan_many_dft_c2r(rank,n,howmany,fin,inembed,istride,idist,fout,onembed,ostride,odist,flags, nthreads) result(plan) 2764 2765 !Arguments ------------------------------------ 2766 !scalars 2767 integer,intent(in) :: rank,howmany,istride,ostride,flags,idist,odist,nthreads 2768 integer,intent(in) :: n(rank),inembed(rank),onembed(rank) 2769 integer(KIND_FFTW_PLAN) :: plan 2770 !arrays 2771 real(dp) :: fin(*),fout(*) 2772 2773 !Local variables------------------------------- 2774 character(len=500) :: msg,frmt 2775 2776 ! ************************************************************************* 2777 2778 !$OMP CRITICAL (OMPC_dplan_many_dft_c2r) 2779 call fftw3_set_nthreads(nthreads) 2780 2781 call dfftw_plan_many_dft_c2r(plan, rank, n, howmany, & 2782 & fin, inembed, istride, idist, fout, onembed, ostride, odist, flags) 2783 !$OMP END CRITICAL (OMPC_dplan_many_dft_c2r) 2784 2785 if (plan==NULL_PLAN) then ! handle the error. 2786 call wrtout(std_out, "dfftw_plan_many_dft_c2r returned NULL_PLAN") 2787 write(frmt,*)"(a,",rank,"(1x,i0),2(a,i0),a,2(a,",rank,"(1x,i0),2(a,i0),a))" 2788 write(msg,frmt)& 2789 & " n = ",n," howmany = ",howmany," flags = ",flags,ch10,& 2790 & " inembed = ",inembed," istride = ",istride," idist = ",idist,ch10,& 2791 & " onembed = ",onembed," ostride = ",ostride," odist = ",idist,ch10 2792 call wrtout(std_out, msg) 2793 ABI_ERROR("Check FFTW library and/or abinit code") 2794 end if 2795 2796 end function dplan_many_dft_c2r
m_fftw3/dplan_many_dft_r2c [ Functions ]
[ Top ] [ m_fftw3 ] [ Functions ]
NAME
FUNCTION
INPUTS
SIDE EFFECTS
SOURCE
2711 !! FIXME technically it should be intent(inout) since FFTW3 can destroy the input 2712 !! for particular flags. 2713 2714 function dplan_many_dft_r2c(rank,n,howmany,fin,inembed,istride,idist,fout,onembed,ostride,odist,flags,nthreads) result(plan) 2715 2716 !Arguments ------------------------------------ 2717 !scalars 2718 integer,intent(in) :: rank,howmany,istride,ostride,flags,idist,odist,nthreads 2719 integer,intent(in) :: n(rank),inembed(rank),onembed(rank) 2720 integer(KIND_FFTW_PLAN) :: plan 2721 !arrays 2722 real(dp) :: fin(*),fout(*) 2723 2724 !Local variables------------------------------- 2725 character(len=500) :: msg,frmt 2726 2727 ! ************************************************************************* 2728 2729 !$OMP CRITICAL (OMPC_dplan_many_dft_r2c) 2730 call fftw3_set_nthreads(nthreads) 2731 2732 call dfftw_plan_many_dft_r2c(plan, rank, n, howmany, & 2733 & fin, inembed, istride, idist, fout, onembed, ostride, odist, flags) 2734 !$OMP END CRITICAL (OMPC_dplan_many_dft_r2c) 2735 2736 if (plan==NULL_PLAN) then ! handle the error. 2737 call wrtout(std_out, "dfftw_plan_many_dft_r2c returned NULL_PLAN") 2738 write(frmt,*)"(a,",rank,"(1x,i0),2(a,i0),a,2(a,",rank,"(1x,i0),2(a,i0),a))" 2739 write(msg,frmt)& 2740 & " n = ",n," howmany = ",howmany," flags = ",flags,ch10,& 2741 & " inembed = ",inembed," istride = ",istride," idist = ",idist,ch10,& 2742 & " onembed = ",onembed," ostride = ",ostride," odist = ",idist,ch10 2743 call wrtout(std_out, msg) 2744 ABI_ERROR("Check FFTW library and/or abinit code") 2745 end if 2746 2747 end function dplan_many_dft_r2c
m_fftw3/fftw3_accrho [ Functions ]
[ Top ] [ m_fftw3 ] [ Functions ]
NAME
fftw3_accrho
FUNCTION
Accumulates the real space density rho from the ndat wavefunctions zf by transforming zf into real space and adding all the amplitudes squared INPUTS: ZF: input array (note the switch of i2 and i3) real(F(i1,i3,i2,idat))=ZF(1,i1,i3,i2,idat) imag(F(i1,i3,i2,idat))=ZF(2,i1,i3,i2,idat) max1 is positive or zero ; m1 >=max1+1 i1= 1... max1+1 corresponds to positive and zero wavevectors 0 ... max1 then, if m1 > max1+1, one has min1=max1-m1+1 and i1= max1+2 ... m1 corresponds to negative wavevectors min1 ... -1 i2 and i3 have a similar definition of range idat=1,ndat md1,md2,md3: Dimension of ZF md2proc=((md2-1)/nproc_fft)+1 ! maximal number of small box 2nd dim slices for one proc weight(ndat)= weight for the density accumulation OUTPUTS: RHOoutput(i1,i2,i3) = RHOinput(i1,i2,i3) + sum on idat of (FFT(ZF))**2 *weight i1=1,n1 , i2=1,n2 , i3=1,n3 comm_fft: MPI communicator nproc_fft: number of processors used as returned by MPI_COMM_SIZE me_fft: [0:nproc_fft-1] number of processor as returned by MPI_COMM_RANK n1,n2,n3: logical dimension of the transform. As transform lengths most products of the prime factors 2,3,5 are allowed. The detailed table with allowed transform lengths can be found in subroutine CTRIG nd1,nd2,nd3: Dimension of RHO nd3proc=((nd3-1)/nproc_fft)+1 ! maximal number of big box 3rd dim slices for one proc NOTES: PERFORMANCE CONSIDERATIONS: The maximum number of processors that can reasonably be used is max(n2/2,n3/2) It is very important to find the optimal value of NCACHE. NCACHE determines the size of the work array ZW, that has to fit into cache. It has therefore to be chosen to equal roughly half the size of the physical cache in units of real*8 numbers. The optimal value of ncache can easily be determined by numerical experimentation. A too large value of ncache leads to a dramatic and sudden decrease of performance, a too small value to a to a slow and less dramatic decrease of performance. If NCACHE is set to a value so small, that not even a single one dimensional transform can be done in the workarray zw, the program stops with an error message.
SOURCE
5793 subroutine fftw3_accrho(cplexwf,ndat,n1,n2,n3,nd1,nd2,nd3,nd3proc,& 5794 & max1,max2,max3,m1,m2,m3,md1,md2proc,md3,comm_fft,nproc_fft,me_fft,zf,rho,weight_r,weight_i) 5795 5796 !Arguments ------------------------------------ 5797 integer,intent(in) :: cplexwf,ndat,n1,n2,n3,nd1,nd2,nd3,nd3proc 5798 integer,intent(in) :: max1,max2,max3,m1,m2,m3,md1,md2proc,md3,comm_fft,nproc_fft,me_fft 5799 real(dp),intent(in) :: zf(2,md1,md3,md2proc,ndat) 5800 real(dp),intent(in) :: weight_r(ndat) , weight_i(ndat) 5801 real(dp),intent(inout) :: rho(nd1,nd2,nd3) 5802 5803 !Local variables------------------------------- 5804 !scalars 5805 #ifdef HAVE_FFTW3 5806 integer,parameter :: unused0=0 5807 integer :: j,i1,idat,ierr,j3glob 5808 integer :: ioption,j2,j3,j2st,jp2st,lzt,m1zt,ma,mb,n1dfft,nnd3 5809 integer :: m2eff,ncache,n1eff,jeff,includelast,lot1,lot2,lot3,nthreads 5810 integer(KIND_FFTW_PLAN) :: bw_plan1_lot,bw_plan1_rest 5811 integer(KIND_FFTW_PLAN) :: bw_plan2_lot,bw_plan2_rest 5812 integer(KIND_FFTW_PLAN) :: bw_plan3_lot,bw_plan3_rest 5813 character(len=500) :: msg 5814 !arrays 5815 real(dp), allocatable :: zmpi1(:,:,:,:),zmpi2(:,:,:,:) ! work arrays for MPI 5816 real(dp),allocatable :: zw(:,:),zt(:,:,:) ! cache work array and array for transpositions 5817 real(dp) :: tsec(2) 5818 5819 ! ************************************************************************* 5820 5821 !ioption=0 ! This was in the old version. 5822 ioption=1 ! This one is needed to be compatible with paral_kgb 5823 5824 !nproc_fft = xmpi_comm_size(comm_fft); me_fft = xmpi_comm_rank(comm_fft) 5825 5826 ! find cache size that gives optimal performance on machine 5827 ncache=2*max(n1,n2,n3,1024) 5828 if (ncache/(2*max(n1,n2,n3)) < 1) then 5829 write(msg,"(5a)") & 5830 & 'ncache has to be enlarged to be able to hold at',ch10,& 5831 & 'least one 1-d FFT of each size even though this will',ch10,& 5832 & 'reduce the performance for shorter transform lengths' 5833 ABI_ERROR(msg) 5834 end if 5835 5836 !Effective m1 and m2 (complex-to-complex or real-to-complex) 5837 n1eff=n1; m2eff=m2 ; m1zt=n1 5838 if (cplexwf==1) then 5839 n1eff=(n1+1)/2; m2eff=m2/2+1; m1zt=2*(n1/2+1) 5840 end if 5841 5842 lzt=m2eff 5843 if (mod(m2eff,2) == 0) lzt=lzt+1 5844 if (mod(m2eff,4) == 0) lzt=lzt+1 5845 5846 ! maximal number of big box 3rd dim slices for all procs 5847 nnd3=nd3proc*nproc_fft 5848 5849 ABI_MALLOC(zw,(2,ncache/2)) 5850 ABI_MALLOC(zt,(2,lzt,m1zt)) 5851 ABI_MALLOC(zmpi2,(2,md1,md2proc,nnd3)) 5852 if (nproc_fft > 1) then 5853 ABI_MALLOC(zmpi1,(2,md1,md2proc,nnd3)) 5854 end if 5855 5856 ! Create plans. 5857 ! The prototype for sfftw_plan_many_dft is: 5858 ! sfftw_plan_many_dft(rank, n, howmany, 5859 ! fin, iembed, istride, idist, 5860 ! fout, oembed, ostride, odist, isign, my_flags) 5861 5862 lot3=ncache/(2*n3) 5863 lot1=ncache/(2*n1) 5864 lot2=ncache/(2*n2) 5865 5866 nthreads = xomp_get_num_threads(open_parallel=.TRUE.) 5867 !nthreads = 1 5868 5869 bw_plan3_lot = dplan_many_dft_2D(1, [n3], lot3, & 5870 & zw, [ncache/2], lot3, 1, & 5871 & zw, [ncache/2], lot3, 1, ABI_FFTW_BACKWARD, ABI_FFTW_ESTIMATE, nthreads) 5872 5873 if (mod(m1, lot3) /= 0) then 5874 bw_plan3_rest = dplan_many_dft_2D(1, [n3], mod(m1, lot3), & 5875 & zw, [ncache/2], lot3, 1, & 5876 & zw, [ncache/2], lot3, 1, ABI_FFTW_BACKWARD, ABI_FFTW_ESTIMATE, nthreads) 5877 end if 5878 5879 bw_plan1_lot = dplan_many_dft_2D(1, [n1], lot1, & 5880 & zw, [ncache/2], lot1, 1, & 5881 & zt, [lzt, m1zt], lzt, 1, ABI_FFTW_BACKWARD, ABI_FFTW_ESTIMATE, nthreads) 5882 5883 if (mod(m2eff, lot1) /= 0) then 5884 bw_plan1_rest = dplan_many_dft_2D(1, [n1], mod(m2eff, lot1), & 5885 & zw, [ncache/2], lot1, 1, & 5886 & zt, [lzt, m1zt], lzt, 1, ABI_FFTW_BACKWARD, ABI_FFTW_ESTIMATE, nthreads) 5887 end if 5888 5889 ! FIXME THis won't work if ixplexwf == 1 5890 bw_plan2_lot = dplan_many_dft_2D(1, [n2], lot2, & 5891 & zw, [ncache/2], lot2, 1, & 5892 & zw, [ncache/2], lot2, 1, ABI_FFTW_BACKWARD, ABI_FFTW_ESTIMATE, nthreads) 5893 5894 if (mod(n1eff, lot2) /= 0) then 5895 bw_plan2_rest = dplan_many_dft_2D(1, [n2], mod(n1eff,lot2), & 5896 & zw, [ncache/2], lot2, 1, & 5897 & zw, [ncache/2], lot2, 1, ABI_FFTW_BACKWARD, ABI_FFTW_ESTIMATE, nthreads) 5898 end if 5899 5900 do idat=1,ndat 5901 ! transform along z axis 5902 ! input: I1,I3,J2,(Jp2) 5903 !lot=ncache/(4*n3) 5904 5905 ! Loop over the y planes treated by this node and trasform n1ddft G_z lines. 5906 do j2=1,md2proc 5907 if (me_fft*md2proc+j2 <= m2eff) then ! MG REMOVED TO BE COSISTENT WITH BACK_WF 5908 do i1=1,m1,lot3 5909 ma=i1 5910 mb=min(i1+(lot3-1),m1) 5911 n1dfft=mb-ma+1 5912 5913 ! zero-pad n1dfft G_z lines 5914 ! input: G1,G3,G2,(Gp2) 5915 ! output: G1,R3,G2,(Gp2) 5916 call fill_cent(md1,md3,lot3,n1dfft,max3,m3,n3,zf(1,i1,1,j2,idat),zw) 5917 5918 ! Transform along z. 5919 if (n1dfft == lot3) then 5920 call dfftw_execute_dft(bw_plan3_lot, zw, zw) 5921 else 5922 call dfftw_execute_dft(bw_plan3_rest, zw, zw) 5923 end if 5924 5925 ! Local rotation. 5926 ! input: G1,R3,G2,(Gp2) 5927 ! output: G1,G2,R3,(Gp2) 5928 call scramble(i1,j2,lot3,n1dfft,md1,n3,md2proc,nnd3,zw,zmpi2) 5929 end do 5930 end if 5931 end do 5932 5933 ! Interprocessor data transposition 5934 ! input: G1,G2,R3,Rp3,(Gp2) 5935 ! output: G1,G2,R3,Gp2,(Rp3) 5936 if (nproc_fft > 1) then 5937 call timab(543,1,tsec) 5938 call xmpi_alltoall(zmpi2,2*md1*md2proc*nd3proc, & 5939 & zmpi1,2*md1*md2proc*nd3proc,comm_fft,ierr) 5940 call timab(543,2,tsec) 5941 end if 5942 5943 ! Loop over the z treated by this node. 5944 do j3=1,nd3proc 5945 j3glob = j3 + me_fft*nd3proc 5946 5947 if (me_fft*nd3proc+j3 <= n3) then 5948 Jp2st=1; J2st=1 5949 5950 ! Loop over G_y in the small box. 5951 do j=1,m2eff,lot1 5952 ma=j 5953 mb=min(j+(lot1-1),m2eff) 5954 n1dfft=mb-ma+1 5955 5956 ! Zero-pad input. 5957 ! input: G1,G2,R3,JG2,(Rp3) 5958 ! output: G2,G1,R3,JG2,(Rp3) 5959 if (nproc_fft == 1) then 5960 call mpiswitch_cent(j3,n1dfft,Jp2st,J2st,lot1,max1,md1,m1,n1,& 5961 & md2proc,nd3proc,nproc_fft,ioption,zmpi2,zw,unused0, unused0,unused0) 5962 else 5963 call mpiswitch_cent(j3,n1dfft,Jp2st,J2st,lot1,max1,md1,m1,n1,& 5964 & md2proc,nd3proc,nproc_fft,ioption,zmpi1,zw, unused0,unused0,unused0) 5965 end if 5966 5967 ! Transform along x 5968 ! input: G2,G1,R3,(Rp3) 5969 ! output: G2,R1,R3,(Rp3) 5970 if (n1dfft == lot1) then 5971 call dfftw_execute_dft(bw_plan1_lot, zw, zt(1,j,1)) 5972 else 5973 call dfftw_execute_dft(bw_plan1_rest, zw, zt(1,j,1)) 5974 end if 5975 5976 end do 5977 5978 ! Transform along y axis (take into account c2c or c2r case). 5979 ! Must loop over the full box. 5980 !lot=ncache/(4*n2) 5981 ! FIXME THis won't work 5982 if (cplexwf==1) then 5983 if (mod(lot2,2) /=0) lot2=lot2-1 ! needed to introduce jeff 5984 end if 5985 5986 do j=1,n1eff,lot2 5987 ma=j 5988 mb=min(j+(lot2-1),n1eff) 5989 n1dfft=mb-ma+1 5990 jeff=j 5991 includelast=1 5992 5993 if (cplexwf==1) then 5994 jeff=2*j-1 5995 includelast=1 5996 if (mb==n1eff .and. n1eff*2/=n1) includelast=0 5997 end if 5998 5999 ! Zero-pad the input. 6000 ! input: G2,R1,R3,(Rp3) 6001 ! output: R1,G2,R3,(Rp3) 6002 if (cplexwf==2) then 6003 call switch_cent(n1dfft,max2,m2,n2,lot2,n1,lzt,zt(1,1,j),zw) 6004 else 6005 call switchreal_cent(includelast,n1dfft,max2,n2,lot2,m1zt,lzt,zt(1,1,jeff),zw) 6006 end if 6007 6008 if (n1dfft == lot2) then 6009 call dfftw_execute_dft(bw_plan2_lot, zw, zw) 6010 else 6011 call dfftw_execute_dft(bw_plan2_rest, zw, zw) 6012 end if 6013 6014 ! Accumulate 6015 call addrho(cplexwf,includelast,nd1,nd2,n2,lot2,n1dfft,& 6016 & zw,rho(jeff,1,j3glob),weight_r(idat),weight_i(idat)) 6017 end do 6018 ! output: i1,i2,j3,(jp3) 6019 6020 end if 6021 end do ! j3 6022 end do ! idat 6023 6024 call dfftw_destroy_plan(bw_plan3_lot) 6025 if (mod(m1, lot3) /= 0) then 6026 call dfftw_destroy_plan(bw_plan3_rest) 6027 end if 6028 6029 call dfftw_destroy_plan(bw_plan1_lot) 6030 if (mod(m2eff, lot1) /= 0) then 6031 call dfftw_destroy_plan(bw_plan1_rest) 6032 end if 6033 6034 call dfftw_destroy_plan(bw_plan2_lot) 6035 if (mod(n1eff, lot2) /= 0) then 6036 call dfftw_destroy_plan(bw_plan2_rest) 6037 end if 6038 6039 ABI_FREE(zmpi2) 6040 ABI_FREE(zw) 6041 ABI_FREE(zt) 6042 if (nproc_fft > 1) then 6043 ABI_FREE(zmpi1) 6044 end if 6045 6046 #else 6047 ABI_ERROR("FFTW3 support not activated") 6048 ABI_UNUSED((/cplexwf,ndat,n1,n2,n3,nd1,nd2,nd3proc/)) 6049 ABI_UNUSED((/ max1,max2,max3,m1,m2,m3,md1,md2proc,md3,comm_fft,nproc_fft,me_fft/)) 6050 ABI_UNUSED((/zf(1,1,1,1,1),rho(1,1,1),weight_r(1),weight_i(1)/)) 6051 #endif 6052 6053 end subroutine fftw3_accrho
m_fftw3/fftw3_alloc_complex1d_dpc [ Functions ]
[ Top ] [ m_fftw3 ] [ Functions ]
NAME
fftw3_alloc_complex1d_dpc
SOURCE
2999 #ifdef HAVE_FFTW3 3000 3001 subroutine fftw3_alloc_complex1d_dpc(size,cptr,fptr) 3002 3003 !Arguments ------------------------------------ 3004 !scalars 3005 integer,intent(in) :: size 3006 complex(dpc),ABI_CONTIGUOUS pointer :: fptr(:) 3007 type(C_PTR),intent(out) :: cptr 3008 3009 ! ************************************************************************* 3010 3011 cptr = fftw_malloc( INT(2*size*C_DOUBLE, KIND=C_SIZE_T)) 3012 if (.not. C_ASSOCIATED(cptr)) then 3013 ABI_ERROR("fftw_malloc returned NULL!") 3014 end if 3015 3016 call c_f_pointer(cptr, fptr, [size]) 3017 3018 end subroutine fftw3_alloc_complex1d_dpc
m_fftw3/fftw3_alloc_complex1d_spc [ Functions ]
[ Top ] [ m_fftw3 ] [ Functions ]
NAME
fftw3_alloc_complex1d_spc
SOURCE
2967 #ifdef HAVE_FFTW3 2968 2969 subroutine fftw3_alloc_complex1d_spc(size,cptr,fptr) 2970 2971 !Arguments ------------------------------------ 2972 !scalars 2973 integer,intent(in) :: size 2974 complex(spc),ABI_CONTIGUOUS pointer :: fptr(:) 2975 type(C_PTR),intent(out) :: cptr 2976 2977 ! ************************************************************************* 2978 2979 cptr = fftw_malloc( INT(2*size*C_FLOAT, KIND=C_SIZE_T)) 2980 if (.not. C_ASSOCIATED(cptr)) then 2981 ABI_ERROR("fftw_malloc returned NULL!") 2982 end if 2983 2984 call c_f_pointer(cptr, fptr, [size]) 2985 2986 end subroutine fftw3_alloc_complex1d_spc
m_fftw3/fftw3_alloc_real1d_dp [ Functions ]
[ Top ] [ m_fftw3 ] [ Functions ]
NAME
fftw3_alloc_real1d_dp
FUNCTION
SOURCE
2903 #ifdef HAVE_FFTW3 2904 2905 subroutine fftw3_alloc_real1d_dp(size,cptr,fptr) 2906 2907 !Arguments ------------------------------------ 2908 !scalars 2909 integer,intent(in) :: size 2910 real(dp),ABI_CONTIGUOUS pointer :: fptr(:) 2911 type(C_PTR),intent(out) :: cptr 2912 2913 ! ************************************************************************* 2914 2915 cptr = fftw_malloc( INT(size*C_DOUBLE, KIND=C_SIZE_T)) 2916 if (.not. C_ASSOCIATED(cptr)) then 2917 ABI_ERROR("fftw_malloc returned NULL!") 2918 end if 2919 2920 call c_f_pointer(cptr, fptr, [size]) 2921 2922 end subroutine fftw3_alloc_real1d_dp
m_fftw3/fftw3_alloc_real2d_dp [ Functions ]
[ Top ] [ m_fftw3 ] [ Functions ]
NAME
fftw3_alloc_real2d_dp
SOURCE
2935 #ifdef HAVE_FFTW3 2936 2937 subroutine fftw3_alloc_real2d_dp(shape,cptr,fptr) 2938 2939 !Arguments ------------------------------------ 2940 !scalars 2941 integer,intent(in) :: shape(2) 2942 real(dp),ABI_CONTIGUOUS pointer :: fptr(:,:) 2943 type(C_PTR),intent(out) :: cptr 2944 2945 ! ************************************************************************* 2946 2947 cptr = fftw_malloc( INT(product(shape)*C_DOUBLE, KIND=C_SIZE_T)) 2948 if (.not. C_ASSOCIATED(cptr)) then 2949 ABI_ERROR("fftw_malloc returned NULL!") 2950 end if 2951 2952 call c_f_pointer(cptr, fptr, shape) 2953 2954 end subroutine fftw3_alloc_real2d_dp
m_fftw3/fftw3_applypot [ Functions ]
[ Top ] [ m_fftw3 ] [ Functions ]
NAME
fftw3_applypot
FUNCTION
Applies the local real space potential to multiple wavefunctions in Fourier space
INPUTS
ZF: Wavefunction (input/output) (note the switch of i2 and i3) real(F(i1,i3,i2,idat))=ZF(1,i1,i3,i2,idat) imag(F(i1,i3,i2,idat))=ZF(2,i1,i3,i2,idat) max1 is positive or zero ; m1 >=max1+1 i1= 1... max1+1 corresponds to positive and zero wavevectors 0 ... max1 then, if m1 > max1+1, one has min1=max1-m1+1 and i1= max1+2 ... m1 corresponds to negative wavevectors min1 ... -1 i2 and i3 have a similar definition of range idat=1,ndat md1,md2,md3: Dimension of ZF (input as well as output), distributed on different procs md2proc=((md2-1)/nproc_fft)+1 maximal number of small box 2nd dim slices for one proc POT: Potential POT(cplex*i1,i2,i3) cplex=1 or 2 , i1=1,n1 , i2=1,n2 , i3=1,n3 nd1,nd2,nd3: dimension of pot comm_fft: MPI communicator nproc_fft: number of processors used as returned by MPI_COMM_SIZE me_fft: [0:nproc_fft-1] number of processor as returned by MPI_COMM_RANK n1,n2,n3: logical dimension of the transform. As transform lengths most products of the prime factors 2,3,5 are allowed. The detailed table with allowed transform lengths can be found in subroutine CTRIG NOTES: PERFORMANCE CONSIDERATIONS: The maximum number of processors that can reasonably be used is max(n2/2,n3/2) It is very important to find the optimal value of NCACHE. NCACHE determines the size of the work array ZW, that has to fit into cache. It has therefore to be chosen to equal roughly half the size of the physical cache in units of real*8 numbers. The optimal value of ncache can easily be determined by numerical experimentation. A too large value of ncache leads to a dramatic and sudden decrease of performance, a too small value to a to a slow and less dramatic decrease of performance. If NCACHE is set to a value so small, that not even a single one dimensional transform can be done in the workarray zw, the program stops with an error message.
SOURCE
5328 subroutine fftw3_applypot(cplexwf,cplex,ndat,n1,n2,n3,nd1,nd2,nd3,nd3proc,& 5329 & max1i,max2i,max3i,m1i,m2i,m3i,md1,md2proc,md3,& 5330 & max1o,max2o,max3o,m1o,m2o,m3o,comm_fft,nproc_fft,me_fft,pot,zf) 5331 5332 !Arguments ------------------------------------ 5333 integer,intent(in) :: cplexwf,cplex,ndat,n1,n2,n3,nd1,nd2,nd3,nd3proc 5334 integer,intent(in) :: max1i,max2i,max3i,m1i,m2i,m3i,md1,md2proc,md3 5335 integer,intent(in) :: max1o,max2o,max3o,m1o,m2o,m3o,comm_fft,nproc_fft,me_fft 5336 real(dp),intent(in) :: pot(cplex*nd1,nd2,nd3) 5337 real(dp),intent(inout) :: zf(2,md1,md3,md2proc,ndat) 5338 5339 !Local variables------------------------------- 5340 !scalars 5341 #ifdef HAVE_FFTW3 5342 integer,parameter :: unused0=0 5343 integer :: j,i1,i2,i3,idat,ierr,j3glob,nthreads 5344 integer :: ioption,j2,j3,lzt,m1zt,ma,mb,n1dfft,nnd3,lot1,lot2,lot3 5345 integer :: m2eff,ncache,n1eff,i1inv,i2inv,i3inv,jeff,includelast,j2stb 5346 integer :: jx,j2stf,Jp2stb,Jp2stf,m2ieff,m2oeff 5347 integer(KIND_FFTW_PLAN) :: bw_plan1_lot,bw_plan1_rest 5348 integer(KIND_FFTW_PLAN) :: bw_plan2_lot,bw_plan2_rest 5349 integer(KIND_FFTW_PLAN) :: bw_plan3_lot,bw_plan3_rest 5350 integer(KIND_FFTW_PLAN) :: fw_plan1_lot,fw_plan1_rest 5351 integer(KIND_FFTW_PLAN) :: fw_plan2_lot,fw_plan2_rest 5352 integer(KIND_FFTW_PLAN) :: fw_plan3_lot,fw_plan3_rest 5353 character(len=500) :: msg 5354 !arrays 5355 real(dp) :: tsec(2) 5356 real(dp), allocatable :: zmpi1(:,:,:,:),zmpi2(:,:,:,:) ! work arrays for MPI 5357 real(dp),allocatable :: zw(:,:),zt(:,:,:) ! cache work array and array for transpositions 5358 ! FFT work arrays 5359 5360 ! ************************************************************************* 5361 5362 !ioption=0 ! This was in the old version. 5363 ioption=1 ! This one is needed to be compatible with paral_kgb 5364 5365 ncache=2*max(n1,n2,n3,1024) 5366 if (ncache/(2*max(n1,n2,n3)) < 1) then 5367 write(msg,"(5a)") & 5368 & 'ncache has to be enlarged to be able to hold at',ch10,& 5369 & 'least one 1-d FFT of each size even though this will',ch10,& 5370 & 'reduce the performance for shorter transform lengths' 5371 ABI_ERROR(msg) 5372 end if 5373 5374 !call wrtout(std_out,"applypot standard ALLTOALL + FFTW3") 5375 5376 ! Effective m1 and m2 (complex-to-complex or real-to-complex) 5377 n1eff=n1; m2ieff=m2i; m2oeff=m2o; m1zt=n1 5378 if (cplexwf==1) then 5379 n1eff=(n1+1)/2; m2ieff=m2i/2+1; m2oeff=m2o/2+1; m1zt=2*(n1/2+1) 5380 end if 5381 5382 m2eff=max(m2ieff,m2oeff) 5383 lzt=m2eff 5384 if (mod(m2eff,2) == 0) lzt=lzt+1 5385 if (mod(m2eff,4) == 0) lzt=lzt+1 5386 5387 ! maximal number of big box 3rd dim slices for all procs 5388 nnd3=nd3proc*nproc_fft 5389 5390 ABI_MALLOC(zw,(2,ncache/2)) 5391 ABI_MALLOC(zt,(2,lzt,m1zt)) 5392 ABI_MALLOC(zmpi2,(2,md1,md2proc,nnd3)) 5393 if (nproc_fft > 1) then 5394 ABI_MALLOC(zmpi1,(2,md1,md2proc,nnd3)) 5395 end if 5396 5397 lot3=ncache/(2*n3) 5398 lot1=ncache/(2*n1) 5399 lot2=ncache/(2*n2) 5400 5401 ! The prototype for sfftw_plan_many_dft is: 5402 ! sfftw_plan_many_dft(rank, n, howmany, 5403 ! fin, iembed, istride, idist, 5404 ! fout, oembed, ostride, odist, isign, my_flags) 5405 5406 nthreads = xomp_get_num_threads(open_parallel=.TRUE.) 5407 !nthreads = 1 5408 5409 ! Create plans for G --> R (see back_wf) 5410 bw_plan3_lot = dplan_many_dft_2D(1, [n3], lot3, & 5411 & zw, [ncache/2], lot3, 1, & 5412 & zw, [ncache/2], lot3, 1, ABI_FFTW_BACKWARD, ABI_FFTW_ESTIMATE, nthreads) 5413 5414 if (mod(m1i, lot3) /= 0) then 5415 bw_plan3_rest = dplan_many_dft_2D(1, [n3], mod(m1i, lot3),& 5416 & zw, [ncache/2], lot3, 1, & 5417 & zw, [ncache/2], lot3, 1, ABI_FFTW_BACKWARD, ABI_FFTW_ESTIMATE, nthreads) 5418 end if 5419 5420 bw_plan1_lot = dplan_many_dft_2D(1, [n1], lot1, & 5421 & zw, [ncache/2], lot1, 1, & 5422 & zt, [lzt, m1zt], lzt, 1, ABI_FFTW_BACKWARD, ABI_FFTW_ESTIMATE, nthreads) 5423 5424 if (mod(m2ieff, lot1) /= 0) then 5425 bw_plan1_rest = dplan_many_dft_2D(1, [n1], mod(m2ieff, lot1), & 5426 & zw, [ncache/2], lot1, 1, & 5427 & zt, [lzt, m1zt], lzt, 1, ABI_FFTW_BACKWARD, ABI_FFTW_ESTIMATE, nthreads) 5428 end if 5429 5430 !TODO this won't work if iclexwf==1 5431 ! Recheck this 5432 bw_plan2_lot = dplan_many_dft_2D(1, [n2], lot2, & 5433 & zw, [ncache/2], lot2, 1, & 5434 & zw, [ncache/2], lot2, 1, ABI_FFTW_BACKWARD, ABI_FFTW_ESTIMATE, nthreads) 5435 5436 if (mod(n1eff, lot2) /= 0) then 5437 bw_plan2_rest = dplan_many_dft_2D(1, [n2], mod(n1eff,lot2), & 5438 & zw, [ncache/2], lot2, 1, & 5439 & zw, [ncache/2], lot2, 1, ABI_FFTW_BACKWARD, ABI_FFTW_ESTIMATE, nthreads) 5440 end if 5441 5442 ! Create plans for G --> R (see forw_wf) 5443 fw_plan3_lot = dplan_many_dft_2D(1, [n3], lot3, & 5444 & zw, [ncache/2], lot3, 1, & 5445 & zw, [ncache/2], lot3, 1, ABI_FFTW_FORWARD, ABI_FFTW_ESTIMATE, nthreads) 5446 5447 if (mod(m1o, lot3) /= 0) then 5448 fw_plan3_rest = dplan_many_dft_2D(1, [n3], mod(m1o, lot3),& 5449 & zw, [ncache/2], lot3, 1, & 5450 & zw, [ncache/2], lot3, 1, ABI_FFTW_FORWARD, ABI_FFTW_ESTIMATE, nthreads) 5451 end if 5452 5453 fw_plan1_lot = dplan_many_dft_2D(1, [n1], lot1,& 5454 & zt, [lzt, m1zt], lzt, 1, & 5455 & zw, [ncache/2], lot1, 1, ABI_FFTW_FORWARD, ABI_FFTW_ESTIMATE, nthreads) 5456 5457 if (mod(m2oeff, lot1) /= 0) then 5458 fw_plan1_rest = dplan_many_dft_2D(1, [n1], mod(m2oeff, lot1),& 5459 & zt, [lzt, m1zt], lzt, 1, & 5460 & zw, [ncache/2], lot1, 1, ABI_FFTW_FORWARD, ABI_FFTW_ESTIMATE, nthreads) 5461 end if 5462 5463 fw_plan2_lot = dplan_many_dft_2D(1, [n2], lot2,& 5464 & zw, [ncache/2], lot2, 1, & 5465 & zw, [ncache/2], lot2, 1, ABI_FFTW_FORWARD, ABI_FFTW_ESTIMATE, nthreads) 5466 5467 if (mod(n1eff, lot2) /= 0) then 5468 fw_plan2_rest = dplan_many_dft_2D(1, [n2], mod(n1eff,lot2),& 5469 & zw, [ncache/2], lot2, 1, & 5470 & zw, [ncache/2], lot2, 1, ABI_FFTW_FORWARD, ABI_FFTW_ESTIMATE, nthreads) 5471 end if 5472 5473 do idat=1,ndat 5474 ! 5475 ! transform along z axis 5476 ! input: G1,G3,G2,(Gp2) 5477 do j2=1,md2proc 5478 if (me_fft*md2proc+j2 <= m2ieff) then 5479 do i1=1,m1i,lot3 5480 ma=i1 5481 mb=min(i1+(lot3-1),m1i) 5482 n1dfft=mb-ma+1 5483 5484 ! zero-pad n1dfft G_z lines 5485 ! input: G1,G3,G2,(Gp2) 5486 call fill_cent(md1,md3,lot3,n1dfft,max3i,m3i,n3,zf(1,i1,1,j2,idat),zw) 5487 5488 if (n1dfft == lot3) then 5489 call dfftw_execute_dft(bw_plan3_lot, zw, zw) 5490 else 5491 call dfftw_execute_dft(bw_plan3_rest, zw, zw) 5492 end if 5493 5494 ! Local rotation. 5495 ! input: G1,R3,G2,(Gp2) 5496 ! output: G1,G2,R3,(Gp2) 5497 call scramble(i1,j2,lot3,n1dfft,md1,n3,md2proc,nnd3,zw,zmpi2) 5498 end do 5499 end if 5500 end do 5501 5502 ! Interprocessor data transposition 5503 ! input: G1,G2,R3,Rp3,(Gp2) 5504 ! output: G1,G2,R3,Gp2,(Rp3) 5505 if (nproc_fft > 1) then 5506 call timab(543,1,tsec) 5507 call xmpi_alltoall(zmpi2,2*md1*md2proc*nd3proc,& 5508 & zmpi1,2*md1*md2proc*nd3proc,comm_fft,ierr) 5509 call timab(543,2,tsec) 5510 end if 5511 5512 do j3=1,nd3proc 5513 j3glob = j3 + me_fft*nd3proc 5514 if (me_fft*nd3proc+j3 <= n3) then 5515 Jp2stb=1; J2stb=1 5516 Jp2stf=1; J2stf=1 5517 5518 ! transform along x axis 5519 do j=1,m2ieff,lot1 5520 ma=j 5521 mb=min(j+(lot1-1),m2ieff) 5522 n1dfft=mb-ma+1 5523 5524 ! Zero-pad input. 5525 ! input: G1,G2,R3,G2,(Rp3) 5526 ! output: G2,G1,R3,G2,(Rp3) 5527 if (nproc_fft == 1) then 5528 call mpiswitch_cent(j3,n1dfft,Jp2stb,J2stb,lot1,max1i,md1,m1i,n1,& 5529 & md2proc,nd3proc,nproc_fft,ioption,zmpi2,zw, unused0, unused0, unused0) 5530 else 5531 call mpiswitch_cent(j3,n1dfft,Jp2stb,J2stb,lot1,max1i,md1,m1i,n1,& 5532 & md2proc,nd3proc,nproc_fft,ioption,zmpi1,zw, unused0, unused0, unused0) 5533 end if 5534 5535 ! Transform along x 5536 ! input: G2,G1,R3,(Rp3) 5537 ! output: G2,R1,R3,(Rp3) 5538 if (n1dfft == lot1) then 5539 call dfftw_execute_dft(bw_plan1_lot, zw, zt(1,j,1)) 5540 else 5541 call dfftw_execute_dft(bw_plan1_rest, zw, zt(1,j,1)) 5542 end if 5543 end do 5544 5545 ! Transform along y axis (take into account c2c or c2r case). 5546 ! Must loop over the full box. 5547 !TODO this won't work 5548 if (cplexwf==1) then 5549 if(mod(lot2,2).ne.0) lot2=lot2-1 ! needed to introduce jeff 5550 end if 5551 5552 do j=1,n1eff,lot2 5553 ma=j 5554 mb=min(j+(lot2-1),n1eff) 5555 n1dfft=mb-ma+1 5556 jeff=j 5557 includelast=1 5558 5559 if (cplexwf==1) then 5560 jeff=2*j-1 5561 includelast=1 5562 if (mb==n1eff .and. n1eff*2/=n1) includelast=0 5563 end if 5564 5565 ! Zero-pad the input. 5566 ! input: G2,R1,R3,(Rp3) 5567 ! output: R1,G2,R3,(Rp3) 5568 if (cplexwf==2) then 5569 call switch_cent(n1dfft,max2i,m2i,n2,lot2,n1,lzt,zt(1,1,jeff),zw) 5570 else 5571 call switchreal_cent(includelast,n1dfft,max2i,n2,lot2,m1zt,lzt,zt(1,1,jeff),zw) 5572 end if 5573 5574 ! input: R1,G2,R3,(Rp3) 5575 ! output: R1,R2,R3,(Rp3) 5576 ! Be careful here 5577 if (n1dfft == lot2) then 5578 call dfftw_execute_dft(bw_plan2_lot, zw, zw) 5579 else 5580 call dfftw_execute_dft(bw_plan2_rest, zw, zw) 5581 end if 5582 5583 ! Multiply with potential in real space 5584 jx=cplex*(jeff-1)+1 5585 call multpot(cplexwf,cplex,includelast,nd1,nd2,n2,lot2,n1dfft,pot(jx,1,j3glob),zw) 5586 5587 ! TRANSFORM BACK IN FOURIER SPACE 5588 ! transform along y axis 5589 ! input: R1,R2,R3,(Rp3) 5590 if (n1dfft == lot2) then 5591 call dfftw_execute_dft(fw_plan2_lot, zw, zw) 5592 else 5593 call dfftw_execute_dft(fw_plan2_rest, zw, zw) 5594 end if 5595 5596 ! input: R1,G2,R3,(Rp3) 5597 ! output: G2,R1,R3,(Rp3) 5598 if (cplexwf==2) then 5599 call unswitch_cent(n1dfft,max2o,m2o,n2,lot2,n1,lzt,zw,zt(1,1,jeff)) 5600 else 5601 call unswitchreal_cent(n1dfft,max2o,n2,lot2,n1,lzt,zw,zt(1,1,jeff)) 5602 end if 5603 end do ! j 5604 5605 ! transform along x axis 5606 ! input: R2,R1,R3,(Rp3) 5607 ! output: R2,G1,R3,(Rp3) 5608 do j=1,m2oeff,lot1 5609 ma=j 5610 mb=min(j+(lot1-1),m2oeff) 5611 n1dfft=mb-ma+1 5612 5613 if (n1dfft == lot1) then 5614 call dfftw_execute_dft(fw_plan1_lot, zt(1,j,1), zw) 5615 else 5616 call dfftw_execute_dft(fw_plan1_rest, zt(1,j,1), zw) 5617 end if 5618 5619 ! input: G2,G1,R3,Gp2,(Rp3) 5620 ! output: G1,G2,R3,Gp2,(Rp3) 5621 if (nproc_fft == 1) then 5622 call unmpiswitch_cent(j3,n1dfft,Jp2stf,J2stf,lot1,max1o,md1,m1o,n1,& 5623 & md2proc,nd3proc,nproc_fft,ioption,zw,zmpi2) 5624 else 5625 call unmpiswitch_cent(j3,n1dfft,Jp2stf,J2stf,lot1,max1o,md1,m1o,n1,& 5626 & md2proc,nd3proc,nproc_fft,ioption,zw,zmpi1) 5627 end if 5628 end do ! j 5629 end if 5630 end do 5631 5632 ! Interprocessor data transposition 5633 ! input: G1,G2,R3,Gp2,(Rp3) 5634 ! output: G1,G2,R3,Rp3,(Gp2) 5635 if (nproc_fft > 1) then 5636 call timab(544,1,tsec) 5637 call xmpi_alltoall(zmpi1,2*md1*md2proc*nd3proc, & 5638 & zmpi2,2*md1*md2proc*nd3proc,comm_fft,ierr) 5639 call timab(544,2,tsec) 5640 end if 5641 5642 ! transform along z axis 5643 ! input: G1,G2,R3,(Gp2) 5644 !lot=ncache/(4*n3) 5645 do j2=1,md2proc 5646 if (me_fft*md2proc+j2 <= m2oeff) then 5647 do i1=1,m1o,lot3 5648 ma=i1 5649 mb=min(i1+(lot3-1),m1o) 5650 n1dfft=mb-ma+1 5651 5652 ! input: G1,G2,R3,(Gp2) 5653 ! output: G1,R3,G2,(Gp2) 5654 call unscramble(i1,j2,lot3,n1dfft,md1,n3,md2proc,nnd3,zmpi2,zw) 5655 5656 if (n1dfft == lot3) then 5657 call dfftw_execute_dft(fw_plan3_lot, zw, zw) 5658 else 5659 call dfftw_execute_dft(fw_plan3_rest, zw, zw) 5660 end if 5661 5662 call unfill_cent(md1,md3,lot3,n1dfft,max3o,m3o,n3,zw,zf(1,i1,1,j2,idat)) 5663 ! output: G1,G3,G2,(Gp2) 5664 end do 5665 end if 5666 end do 5667 5668 ! Complete missing values with complex conjugate 5669 ! Inverse of ix is located at nx+2-ix , except for ix=1, for which it is 1. 5670 if (cplexwf==1) then 5671 do i3=1,m3o 5672 i3inv=m3o+2-i3 5673 if (i3==1) i3inv=1 5674 if (m2oeff>1)then 5675 do i2=2,m2oeff 5676 i2inv=m2o+2-i2 5677 zf(1,1,i3inv,i2inv,idat)= zf(1,1,i3,i2,idat) 5678 zf(2,1,i3inv,i2inv,idat)=-zf(2,1,i3,i2,idat) 5679 do i1=2,m1o 5680 i1inv=m1o+2-i1 5681 zf(1,i1inv,i3inv,i2inv,idat)= zf(1,i1,i3,i2,idat) 5682 zf(2,i1inv,i3inv,i2inv,idat)=-zf(2,i1,i3,i2,idat) 5683 end do 5684 end do 5685 end if 5686 end do 5687 end if 5688 5689 end do ! idat 5690 5691 call dfftw_destroy_plan(bw_plan3_lot) 5692 if (mod(m1i, lot3) /= 0) then 5693 call dfftw_destroy_plan(bw_plan3_rest) 5694 end if 5695 5696 call dfftw_destroy_plan(bw_plan1_lot) 5697 if (mod(m2ieff, lot1) /= 0) then 5698 call dfftw_destroy_plan(bw_plan1_rest) 5699 end if 5700 5701 call dfftw_destroy_plan(bw_plan2_lot) 5702 if (mod(n1eff, lot2) /= 0) then 5703 call dfftw_destroy_plan(bw_plan2_rest) 5704 end if 5705 5706 call dfftw_destroy_plan(fw_plan3_lot) 5707 if (mod(m1o, lot3) /= 0) then 5708 call dfftw_destroy_plan(fw_plan3_rest) 5709 end if 5710 5711 call dfftw_destroy_plan(fw_plan1_lot) 5712 if (mod(m2oeff, lot1) /= 0) then 5713 call dfftw_destroy_plan(fw_plan1_rest) 5714 end if 5715 5716 call dfftw_destroy_plan(fw_plan2_lot) 5717 if (mod(n1eff, lot2) /= 0) then 5718 call dfftw_destroy_plan(fw_plan2_rest) 5719 end if 5720 5721 ABI_FREE(zmpi2) 5722 ABI_FREE(zw) 5723 ABI_FREE(zt) 5724 if (nproc_fft > 1) then 5725 ABI_FREE(zmpi1) 5726 end if 5727 5728 #else 5729 ABI_ERROR("FFTW3 support not activated") 5730 ABI_UNUSED((/cplexwf,cplex,ndat,n1,n2,n3,nd1,nd2,nd3,nd3proc/)) 5731 ABI_UNUSED((/max1i,max2i,max3i,m1i,m2i,m3i,md1,md2proc,md3/)) 5732 ABI_UNUSED((/max1o,max2o,max3o,m1o,m2o,m3o,comm_fft,nproc_fft,me_fft/)) 5733 ABI_UNUSED((/pot(1,1,1),zf(1,1,1,1,1)/)) 5734 #endif 5735 5736 end subroutine fftw3_applypot
m_fftw3/fftw3_applypot_many [ Functions ]
[ Top ] [ m_fftw3 ] [ Functions ]
NAME
fftw3_applypot_many
FUNCTION
Applies the local real space potential to multiple wavefunctions in Fourier space
INPUTS
ZF: Wavefunction (input/output) (note the switch of i2 and i3) real(F(i1,i3,i2,idat))=ZF(1,i1,i3,i2,idat) imag(F(i1,i3,i2,idat))=ZF(2,i1,i3,i2,idat) max1 is positive or zero ; m1 >=max1+1 i1= 1... max1+1 corresponds to positive and zero wavevectors 0 ... max1 then, if m1 > max1+1, one has min1=max1-m1+1 and i1= max1+2 ... m1 corresponds to negative wavevectors min1 ... -1 i2 and i3 have a similar definition of range idat=1,ndat md1,md2,md3: Dimension of ZF (input as well as output), distributed on different procs md2proc=((md2-1)/nproc_fft)+1 maximal number of small box 2nd dim slices for one proc POT: Potential POT(cplex*i1,i2,i3) cplex=1 or 2 , i1=1,n1 , i2=1,n2 , i3=1,n3 nd1,nd2,nd3: dimension of pot comm_fft: MPI communicator nproc_fft: number of processors used as returned by MPI_COMM_SIZE me_fft: [0:nproc_fft-1] number of processor as returned by MPI_COMM_RANK n1,n2,n3: logical dimension of the transform. As transform lengths most products of the prime factors 2,3,5 are allowed. The detailed table with allowed transform lengths can be found in subroutine CTRIG NOTES: PERFORMANCE CONSIDERATIONS: The maximum number of processors that can reasonably be used is max(n2/2,n3/2) It is very important to find the optimal value of NCACHE. NCACHE determines the size of the work array ZW, that has to fit into cache. It has therefore to be chosen to equal roughly half the size of the physical cache in units of real*8 numbers. The optimal value of ncache can easily be determined by numerical experimentation. A too large value of ncache leads to a dramatic and sudden decrease of performance, a too small value to a to a slow and less dramatic decrease of performance. If NCACHE is set to a value so small, that not even a single one dimensional transform can be done in the workarray zw, the program stops with an error message.
SOURCE
6798 subroutine fftw3_applypot_many(cplexwf,cplex,ndat,n1,n2,n3,nd1,nd2,nd3,nd3proc,& 6799 & max1i,max2i,max3i,m1i,m2i,m3i,md1,md2proc,md3,& 6800 & max1o,max2o,max3o,m1o,m2o,m3o,comm_fft,nproc_fft,me_fft,pot,zf) 6801 6802 !Arguments ------------------------------------ 6803 integer,intent(in) :: cplexwf,cplex,ndat,n1,n2,n3,nd1,nd2,nd3,nd3proc 6804 integer,intent(in) :: max1i,max2i,max3i,m1i,m2i,m3i,md1,md2proc,md3 6805 integer,intent(in) :: max1o,max2o,max3o,m1o,m2o,m3o,comm_fft,nproc_fft,me_fft 6806 real(dp),intent(in) :: pot(cplex*nd1,nd2,nd3) 6807 real(dp),intent(inout) :: zf(2,md1,md3,md2proc,ndat) 6808 6809 !Local variables------------------------------- 6810 !scalars 6811 #ifdef HAVE_FFTW3 6812 integer,parameter :: unused0=0 6813 integer :: j,i1,i2,i3,idat,ierr,j3glob,nthreads 6814 integer :: ioption,j2,j3,lzt,m1zt,ma,mb,n1dfft,nnd3,lot1,lot2,lot3 6815 integer :: m2eff,ncache,n1eff,i1inv,i2inv,i3inv,jeff,includelast,j2stb 6816 integer :: jx,j2stf,Jp2stb,Jp2stf,m2ieff,m2oeff 6817 integer(KIND_FFTW_PLAN) :: bw_plan1_lot,bw_plan1_rest 6818 integer(KIND_FFTW_PLAN) :: bw_plan2_lot,bw_plan2_rest 6819 integer(KIND_FFTW_PLAN) :: bw_plan3_lot,bw_plan3_rest 6820 integer(KIND_FFTW_PLAN) :: fw_plan1_lot,fw_plan1_rest 6821 integer(KIND_FFTW_PLAN) :: fw_plan2_lot,fw_plan2_rest 6822 integer(KIND_FFTW_PLAN) :: fw_plan3_lot,fw_plan3_rest 6823 character(len=500) :: msg 6824 !arrays 6825 integer :: requests(ndat) 6826 real(dp) :: tsec(2) 6827 real(dp) ABI_ASYNC, allocatable :: zmpi1(:,:,:,:,:),zmpi2(:,:,:,:,:) ! work arrays for MPI 6828 real(dp),allocatable :: zw(:,:),zt(:,:,:) ! cache work array and array for transpositions 6829 ! FFT work arrays 6830 6831 ! ************************************************************************* 6832 6833 !ioption=0 ! This was in the old version. 6834 ioption=1 ! This one is needed to be compatible with paral_kgb 6835 6836 ncache=2*max(n1,n2,n3,1024) 6837 if (ncache/(2*max(n1,n2,n3)) < 1) then 6838 write(msg,"(5a)") & 6839 & 'ncache has to be enlarged to be able to hold at',ch10,& 6840 & 'least one 1-d FFT of each size even though this will',ch10,& 6841 & 'reduce the performance for shorter transform lengths' 6842 ABI_ERROR(msg) 6843 end if 6844 6845 !call wrtout(std_out,"applypot with non-blocking IALLTOALL + FFTW3") 6846 !write(std_out,"(a,i0)")"in applypot_many with ndat: ",ndat 6847 6848 ! Effective m1 and m2 (complex-to-complex or real-to-complex) 6849 n1eff=n1; m2ieff=m2i; m2oeff=m2o; m1zt=n1 6850 if (cplexwf==1) then 6851 n1eff=(n1+1)/2; m2ieff=m2i/2+1; m2oeff=m2o/2+1; m1zt=2*(n1/2+1) 6852 end if 6853 6854 m2eff=max(m2ieff,m2oeff) 6855 lzt=m2eff 6856 if (mod(m2eff,2) == 0) lzt=lzt+1 6857 if (mod(m2eff,4) == 0) lzt=lzt+1 6858 6859 ! maximal number of big box 3rd dim slices for all procs 6860 nnd3=nd3proc*nproc_fft 6861 6862 ABI_MALLOC(zw,(2,ncache/2)) 6863 ABI_MALLOC(zt,(2,lzt,m1zt)) 6864 ABI_MALLOC(zmpi2,(2,md1,md2proc,nnd3,ndat)) 6865 if (nproc_fft > 1) then 6866 ABI_MALLOC(zmpi1,(2,md1,md2proc,nnd3,ndat)) 6867 end if 6868 6869 lot3=ncache/(2*n3) 6870 lot1=ncache/(2*n1) 6871 lot2=ncache/(2*n2) 6872 6873 nthreads = xomp_get_num_threads(open_parallel=.TRUE.) 6874 !nthreads = 1 6875 6876 ! The prototype for sfftw_plan_many_dft is: 6877 ! sfftw_plan_many_dft(rank, n, howmany, 6878 ! fin, iembed, istride, idist, 6879 ! fout, oembed, ostride, odist, isign, my_flags) 6880 6881 ! Create plans for G --> R (see back_wf) 6882 bw_plan3_lot = dplan_many_dft_2D(1, [n3], lot3, & 6883 & zw, [ncache/2], lot3, 1, & 6884 & zw, [ncache/2], lot3, 1, ABI_FFTW_BACKWARD, ABI_FFTW_ESTIMATE, nthreads) 6885 6886 if (mod(m1i, lot3) /= 0) then 6887 bw_plan3_rest = dplan_many_dft_2D(1, [n3], mod(m1i, lot3),& 6888 & zw, [ncache/2], lot3, 1, & 6889 & zw, [ncache/2], lot3, 1, ABI_FFTW_BACKWARD, ABI_FFTW_ESTIMATE, nthreads) 6890 end if 6891 6892 bw_plan1_lot = dplan_many_dft_2D(1, [n1], lot1, & 6893 & zw, [ncache/2], lot1, 1, & 6894 & zt, [lzt, m1zt], lzt, 1, ABI_FFTW_BACKWARD, ABI_FFTW_ESTIMATE, nthreads) 6895 6896 if (mod(m2ieff, lot1) /= 0) then 6897 bw_plan1_rest = dplan_many_dft_2D(1, [n1], mod(m2ieff, lot1), & 6898 & zw, [ncache/2], lot1, 1, & 6899 & zt, [lzt, m1zt], lzt, 1, ABI_FFTW_BACKWARD, ABI_FFTW_ESTIMATE, nthreads) 6900 end if 6901 6902 !TODO this won't work if iclexwf==1 6903 ! Recheck this 6904 bw_plan2_lot = dplan_many_dft_2D(1, [n2], lot2, & 6905 & zw, [ncache/2], lot2, 1, & 6906 & zw, [ncache/2], lot2, 1, ABI_FFTW_BACKWARD, ABI_FFTW_ESTIMATE, nthreads) 6907 6908 if (mod(n1eff, lot2) /= 0) then 6909 bw_plan2_rest = dplan_many_dft_2D(1, [n2], mod(n1eff,lot2), & 6910 & zw, [ncache/2], lot2, 1, & 6911 & zw, [ncache/2], lot2, 1, ABI_FFTW_BACKWARD, ABI_FFTW_ESTIMATE, nthreads) 6912 end if 6913 6914 ! Create plans for G --> R (see forw_wf) 6915 fw_plan3_lot = dplan_many_dft_2D(1, [n3], lot3, & 6916 & zw, [ncache/2], lot3, 1, & 6917 & zw, [ncache/2], lot3, 1, ABI_FFTW_FORWARD, ABI_FFTW_ESTIMATE, nthreads) 6918 6919 if (mod(m1o, lot3) /= 0) then 6920 fw_plan3_rest = dplan_many_dft_2D(1, [n3], mod(m1o, lot3),& 6921 & zw, [ncache/2], lot3, 1, & 6922 & zw, [ncache/2], lot3, 1, ABI_FFTW_FORWARD, ABI_FFTW_ESTIMATE, nthreads) 6923 end if 6924 6925 fw_plan1_lot = dplan_many_dft_2D(1, [n1], lot1,& 6926 & zt, [lzt, m1zt], lzt, 1, & 6927 & zw, [ncache/2], lot1, 1, ABI_FFTW_FORWARD, ABI_FFTW_ESTIMATE, nthreads) 6928 6929 if (mod(m2oeff, lot1) /= 0) then 6930 fw_plan1_rest = dplan_many_dft_2D(1, [n1], mod(m2oeff, lot1),& 6931 & zt, [lzt, m1zt], lzt, 1, & 6932 & zw, [ncache/2], lot1, 1, ABI_FFTW_FORWARD, ABI_FFTW_ESTIMATE, nthreads) 6933 end if 6934 6935 fw_plan2_lot = dplan_many_dft_2D(1, [n2], lot2,& 6936 & zw, [ncache/2], lot2, 1, & 6937 & zw, [ncache/2], lot2, 1, ABI_FFTW_FORWARD, ABI_FFTW_ESTIMATE, nthreads) 6938 6939 if (mod(n1eff, lot2) /= 0) then 6940 fw_plan2_rest = dplan_many_dft_2D(1, [n2], mod(n1eff,lot2),& 6941 & zw, [ncache/2], lot2, 1, & 6942 & zw, [ncache/2], lot2, 1, ABI_FFTW_FORWARD, ABI_FFTW_ESTIMATE, nthreads) 6943 end if 6944 6945 ! Here we take advantage of non-blocking IALLTOALL: 6946 ! Perform the first step of MPI-FFT for ndat wavefunctions. 6947 do idat=1,ndat 6948 ! 6949 ! transform along z axis 6950 ! input: G1,G3,G2,(Gp2) 6951 do j2=1,md2proc 6952 if (me_fft*md2proc+j2 <= m2ieff) then 6953 do i1=1,m1i,lot3 6954 ma=i1 6955 mb=min(i1+(lot3-1),m1i) 6956 n1dfft=mb-ma+1 6957 6958 ! zero-pad n1dfft G_z lines 6959 ! input: G1,G3,G2,(Gp2) 6960 call fill_cent(md1,md3,lot3,n1dfft,max3i,m3i,n3,zf(1,i1,1,j2,idat),zw) 6961 6962 if (n1dfft == lot3) then 6963 call dfftw_execute_dft(bw_plan3_lot, zw, zw) 6964 else 6965 call dfftw_execute_dft(bw_plan3_rest, zw, zw) 6966 end if 6967 6968 ! Local rotation. 6969 ! input: G1,R3,G2,(Gp2) 6970 ! output: G1,G2,R3,(Gp2) 6971 call scramble(i1,j2,lot3,n1dfft,md1,n3,md2proc,nnd3,zw,zmpi2(:,:,:,:,idat)) 6972 end do 6973 end if 6974 end do 6975 6976 ! Interprocessor data transposition 6977 ! input: G1,G2,R3,Rp3,(Gp2) 6978 ! output: G1,G2,R3,Gp2,(Rp3) 6979 if (nproc_fft > 1) then 6980 call timab(543,1,tsec) 6981 call xmpi_ialltoall(zmpi2(:,:,:,:,idat),2*md1*md2proc*nd3proc,& 6982 & zmpi1(:,:,:,:,idat),2*md1*md2proc*nd3proc,comm_fft,requests(idat)) 6983 call timab(543,2,tsec) 6984 end if 6985 end do ! idat 6986 6987 ! The second step of MPI-FFT 6988 do idat=1,ndat 6989 ! Make sure communication is completed. 6990 if (nproc_fft>1) call xmpi_wait(requests(idat),ierr) 6991 6992 do j3=1,nd3proc 6993 j3glob = j3 + me_fft*nd3proc 6994 if (me_fft*nd3proc+j3 <= n3) then 6995 Jp2stb=1; J2stb=1 6996 Jp2stf=1; J2stf=1 6997 6998 ! transform along x axis 6999 do j=1,m2ieff,lot1 7000 ma=j 7001 mb=min(j+(lot1-1),m2ieff) 7002 n1dfft=mb-ma+1 7003 7004 ! Zero-pad input. 7005 ! input: G1,G2,R3,G2,(Rp3) 7006 ! output: G2,G1,R3,G2,(Rp3) 7007 if (nproc_fft == 1) then 7008 call mpiswitch_cent(j3,n1dfft,Jp2stb,J2stb,lot1,max1i,md1,m1i,n1,& 7009 & md2proc,nd3proc,nproc_fft,ioption,zmpi2(:,:,:,:,idat),zw, unused0, unused0, unused0) 7010 else 7011 call mpiswitch_cent(j3,n1dfft,Jp2stb,J2stb,lot1,max1i,md1,m1i,n1,& 7012 & md2proc,nd3proc,nproc_fft,ioption,zmpi1(:,:,:,:,idat),zw, unused0, unused0, unused0) 7013 end if 7014 7015 ! Transform along x 7016 ! input: G2,G1,R3,(Rp3) 7017 ! output: G2,R1,R3,(Rp3) 7018 if (n1dfft == lot1) then 7019 call dfftw_execute_dft(bw_plan1_lot, zw, zt(1,j,1)) 7020 else 7021 call dfftw_execute_dft(bw_plan1_rest, zw, zt(1,j,1)) 7022 end if 7023 end do 7024 7025 ! Transform along y axis (take into account c2c or c2r case). 7026 ! Must loop over the full box. 7027 !TODO this won't work 7028 if (cplexwf==1) then 7029 if(mod(lot2,2).ne.0) lot2=lot2-1 ! needed to introduce jeff 7030 end if 7031 7032 do j=1,n1eff,lot2 7033 ma=j 7034 mb=min(j+(lot2-1),n1eff) 7035 n1dfft=mb-ma+1 7036 jeff=j 7037 includelast=1 7038 7039 if (cplexwf==1) then 7040 jeff=2*j-1 7041 includelast=1 7042 if (mb==n1eff .and. n1eff*2/=n1) includelast=0 7043 end if 7044 7045 ! Zero-pad the input. 7046 ! input: G2,R1,R3,(Rp3) 7047 ! output: R1,G2,R3,(Rp3) 7048 if (cplexwf==2) then 7049 call switch_cent(n1dfft,max2i,m2i,n2,lot2,n1,lzt,zt(1,1,jeff),zw) 7050 else 7051 call switchreal_cent(includelast,n1dfft,max2i,n2,lot2,m1zt,lzt,zt(1,1,jeff),zw) 7052 end if 7053 7054 ! input: R1,G2,R3,(Rp3) 7055 ! output: R1,R2,R3,(Rp3) 7056 ! Be careful here 7057 if (n1dfft == lot2) then 7058 call dfftw_execute_dft(bw_plan2_lot, zw, zw) 7059 else 7060 call dfftw_execute_dft(bw_plan2_rest, zw, zw) 7061 end if 7062 7063 ! Multiply with potential in real space 7064 jx=cplex*(jeff-1)+1 7065 call multpot(cplexwf,cplex,includelast,nd1,nd2,n2,lot2,n1dfft,pot(jx,1,j3glob),zw) 7066 7067 ! TRANSFORM BACK IN FOURIER SPACE 7068 ! transform along y axis 7069 ! input: R1,R2,R3,(Rp3) 7070 if (n1dfft == lot2) then 7071 call dfftw_execute_dft(fw_plan2_lot, zw, zw) 7072 else 7073 call dfftw_execute_dft(fw_plan2_rest, zw, zw) 7074 end if 7075 7076 ! input: R1,G2,R3,(Rp3) 7077 ! output: G2,R1,R3,(Rp3) 7078 if (cplexwf==2) then 7079 call unswitch_cent(n1dfft,max2o,m2o,n2,lot2,n1,lzt,zw,zt(1,1,jeff)) 7080 else 7081 call unswitchreal_cent(n1dfft,max2o,n2,lot2,n1,lzt,zw,zt(1,1,jeff)) 7082 end if 7083 end do ! j 7084 7085 ! transform along x axis 7086 ! input: R2,R1,R3,(Rp3) 7087 ! output: R2,G1,R3,(Rp3) 7088 do j=1,m2oeff,lot1 7089 ma=j 7090 mb=min(j+(lot1-1),m2oeff) 7091 n1dfft=mb-ma+1 7092 7093 if (n1dfft == lot1) then 7094 call dfftw_execute_dft(fw_plan1_lot, zt(1,j,1), zw) 7095 else 7096 call dfftw_execute_dft(fw_plan1_rest, zt(1,j,1), zw) 7097 end if 7098 7099 ! input: G2,G1,R3,Gp2,(Rp3) 7100 ! output: G1,G2,R3,Gp2,(Rp3) 7101 if (nproc_fft == 1) then 7102 call unmpiswitch_cent(j3,n1dfft,Jp2stf,J2stf,lot1,max1o,md1,m1o,n1,& 7103 & md2proc,nd3proc,nproc_fft,ioption,zw,zmpi2(:,:,:,:,idat)) 7104 else 7105 call unmpiswitch_cent(j3,n1dfft,Jp2stf,J2stf,lot1,max1o,md1,m1o,n1,& 7106 & md2proc,nd3proc,nproc_fft,ioption,zw,zmpi1(:,:,:,:,idat)) 7107 end if 7108 end do ! j 7109 end if 7110 end do 7111 7112 ! Interprocessor data transposition 7113 ! input: G1,G2,R3,Gp2,(Rp3) 7114 ! output: G1,G2,R3,Rp3,(Gp2) 7115 if (nproc_fft > 1) then 7116 call timab(544,1,tsec) 7117 call xmpi_ialltoall(zmpi1(:,:,:,:,idat),2*md1*md2proc*nd3proc, & 7118 & zmpi2(:,:,:,:,idat),2*md1*md2proc*nd3proc,comm_fft,requests(idat)) 7119 call timab(544,2,tsec) 7120 end if 7121 end do 7122 7123 do idat=1,ndat 7124 if (nproc_fft>1) call xmpi_wait(requests(idat),ierr) 7125 ! transform along z axis 7126 ! input: G1,G2,R3,(Gp2) 7127 !lot=ncache/(4*n3) 7128 do j2=1,md2proc 7129 if (me_fft*md2proc+j2 <= m2oeff) then 7130 do i1=1,m1o,lot3 7131 ma=i1 7132 mb=min(i1+(lot3-1),m1o) 7133 n1dfft=mb-ma+1 7134 7135 ! input: G1,G2,R3,(Gp2) 7136 ! output: G1,R3,G2,(Gp2) 7137 call unscramble(i1,j2,lot3,n1dfft,md1,n3,md2proc,nnd3,zmpi2(:,:,:,:,idat),zw) 7138 7139 if (n1dfft == lot3) then 7140 call dfftw_execute_dft(fw_plan3_lot, zw, zw) 7141 else 7142 call dfftw_execute_dft(fw_plan3_rest, zw, zw) 7143 end if 7144 7145 call unfill_cent(md1,md3,lot3,n1dfft,max3o,m3o,n3,zw,zf(1,i1,1,j2,idat)) 7146 ! output: G1,G3,G2,(Gp2) 7147 end do 7148 end if 7149 end do 7150 7151 ! Complete missing values with complex conjugate 7152 ! Inverse of ix is located at nx+2-ix , except for ix=1, for which it is 1. 7153 if (cplexwf==1) then 7154 do i3=1,m3o 7155 i3inv=m3o+2-i3 7156 if (i3==1) i3inv=1 7157 if (m2oeff>1)then 7158 do i2=2,m2oeff 7159 i2inv=m2o+2-i2 7160 zf(1,1,i3inv,i2inv,idat)= zf(1,1,i3,i2,idat) 7161 zf(2,1,i3inv,i2inv,idat)=-zf(2,1,i3,i2,idat) 7162 do i1=2,m1o 7163 i1inv=m1o+2-i1 7164 zf(1,i1inv,i3inv,i2inv,idat)= zf(1,i1,i3,i2,idat) 7165 zf(2,i1inv,i3inv,i2inv,idat)=-zf(2,i1,i3,i2,idat) 7166 end do 7167 end do 7168 end if 7169 end do 7170 end if 7171 7172 end do ! idat 7173 7174 call dfftw_destroy_plan(bw_plan3_lot) 7175 if (mod(m1i, lot3) /= 0) then 7176 call dfftw_destroy_plan(bw_plan3_rest) 7177 end if 7178 7179 call dfftw_destroy_plan(bw_plan1_lot) 7180 if (mod(m2ieff, lot1) /= 0) then 7181 call dfftw_destroy_plan(bw_plan1_rest) 7182 end if 7183 7184 call dfftw_destroy_plan(bw_plan2_lot) 7185 if (mod(n1eff, lot2) /= 0) then 7186 call dfftw_destroy_plan(bw_plan2_rest) 7187 end if 7188 7189 call dfftw_destroy_plan(fw_plan3_lot) 7190 if (mod(m1o, lot3) /= 0) then 7191 call dfftw_destroy_plan(fw_plan3_rest) 7192 end if 7193 7194 call dfftw_destroy_plan(fw_plan1_lot) 7195 if (mod(m2oeff, lot1) /= 0) then 7196 call dfftw_destroy_plan(fw_plan1_rest) 7197 end if 7198 7199 call dfftw_destroy_plan(fw_plan2_lot) 7200 if (mod(n1eff, lot2) /= 0) then 7201 call dfftw_destroy_plan(fw_plan2_rest) 7202 end if 7203 7204 ABI_FREE(zmpi2) 7205 ABI_FREE(zw) 7206 ABI_FREE(zt) 7207 if (nproc_fft > 1) then 7208 ABI_FREE(zmpi1) 7209 end if 7210 7211 #else 7212 ABI_ERROR("FFTW3 support not activated") 7213 ABI_UNUSED((/cplexwf,cplex,ndat,n1,n2,n3,nd1,nd2,nd3,nd3proc/)) 7214 ABI_UNUSED((/max1i,max2i,max3i,m1i,m2i,m3i,md1,md2proc,md3/)) 7215 ABI_UNUSED((/max1o,max2o,max3o,m1o,m2o,m3o,comm_fft,nproc_fft,me_fft/)) 7216 ABI_UNUSED((/pot(1,1,1),zf(1,1,1,1,1)/)) 7217 #endif 7218 7219 end subroutine fftw3_applypot_many
m_fftw3/fftw3_c2c_ip_dpc [ Functions ]
[ Top ] [ m_fftw3 ] [ Functions ]
NAME
fftw3_c2c_ip_dpc
FUNCTION
Driver routine for in-place 3D complex-complex FFT.
INPUTS
nx,ny,nz=Number of points along the three directions. ldx,ldy,ldz=Physical dimensions of the array. ndat=Number of FFTs to be done. iscale=0 if G --> R FFT should not be scaled. isign= +1 : ff(G) => ff(R); -1 : ff(R) => ff(G) [fftw_flags]=Flags used to create the plan. They can be combined with the "+" operator. Defaults to ABI_FFTW_ESTIMATE.
SIDE EFFECTS
ff(ldx*ldy*ldz*ndat)= In input: the complex array to be transformed. In output: the Fourier transformed in the space specified by isign.
SOURCE
1561 subroutine fftw3_c2c_ip_dpc(nx, ny, nz, ldx, ldy, ldz, ndat, iscale, isign, ff, fftw_flags) 1562 1563 !Arguments ------------------------------------ 1564 !scalars 1565 integer,intent(in) :: nx,ny,nz,ldx,ldy,ldz,ndat,iscale,isign 1566 integer,optional,intent(in) :: fftw_flags 1567 !arrays 1568 complex(dpc),intent(inout) :: ff(ldx*ldy*ldz*ndat) 1569 1570 #ifdef HAVE_FFTW3 1571 !Local variables------------------------------- 1572 !scalars 1573 integer,parameter :: rank3=3,nt_all=-1 1574 integer :: my_flags,dist,stride 1575 integer(KIND_FFTW_PLAN) :: my_plan 1576 !arrays 1577 integer :: embed(rank3),n(rank3) 1578 1579 ! ************************************************************************* 1580 1581 my_flags=ABI_FFTW_ESTIMATE; if (PRESENT(fftw_flags)) my_flags=fftw_flags 1582 1583 stride = 1 1584 dist = ldx*ldy*ldz 1585 embed = [ldx, ldy, ldz] 1586 n = [nx, ny, nz] 1587 1588 my_plan = fftw3_plan_many_dft(rank3, n, ndat, ff, embed, stride, dist, ff, embed, stride, dist, isign, my_flags, nt_all) 1589 1590 ! Now perform the 3D FFT via FFTW. 1591 call dfftw_execute_dft(my_plan, ff, ff) 1592 1593 call fftw3_destroy_plan(my_plan) 1594 1595 ! -1, FFTW returns not normalized FTs 1596 if (isign == ABI_FFTW_FORWARD .and. iscale /= 0) then 1597 call ZDSCAL(ldx*ldy*ldz*ndat, one/(nx*ny*nz), ff, 1) 1598 end if 1599 1600 #else 1601 ABI_ERROR("FFTW3 support not activated") 1602 ABI_UNUSED((/nx,ny,nz,ldx,ldy,ldz,isign/)) 1603 ABI_UNUSED(ff) 1604 if (PRESENT(fftw_flags)) then 1605 ABI_UNUSED(fftw_flags) 1606 end if 1607 #endif 1608 1609 end subroutine fftw3_c2c_ip_dpc
m_fftw3/fftw3_c2c_ip_spc [ Functions ]
[ Top ] [ m_fftw3 ] [ Functions ]
NAME
fftw3_c2c_ip_spc
FUNCTION
Driver routine for in-place 3D complex-complex FFT. TARGET: Simple precision complex arrays.
INPUTS
nx,ny,nz=Number of points along the three directions. ldx,ldy,ldz=Physical dimensions of the array. ndat=Number of FFTs to be done. iscale=0 if G --> R FFT should not be scaled. isign= +1 : ff(G) => ff(R); -1 : ff(R) => ff(G) [fftw_flags]=Flags used to create the plan. They can be combined with the "+" operator. Defaults to ABI_FFTW_ESTIMATE.
SIDE EFFECTS
ff(ldx*ldy*ldz*ndat)= In input: the complex array to be transformed. In output: the Fourier transformed in the space specified by isign.
SOURCE
1426 subroutine fftw3_c2c_ip_spc(nx, ny, nz, ldx, ldy, ldz, ndat, iscale, isign, ff, fftw_flags) 1427 1428 !Arguments ------------------------------------ 1429 !scalars 1430 integer,intent(in) :: nx,ny,nz,ldx,ldy,ldz,ndat,iscale,isign 1431 integer,optional,intent(in) :: fftw_flags 1432 !arrays 1433 complex(spc),intent(inout) :: ff(ldx*ldy*ldz*ndat) 1434 1435 #ifdef HAVE_FFTW3 1436 !Local variables------------------------------- 1437 !scalars 1438 integer,parameter :: rank3=3,nt_all=-1 1439 integer :: my_flags,dist,stride 1440 integer(KIND_FFTW_PLAN) :: my_plan 1441 !arrays 1442 integer :: embed(rank3),n(rank3) 1443 1444 ! ************************************************************************* 1445 1446 my_flags=ABI_FFTW_ESTIMATE; if (PRESENT(fftw_flags)) my_flags=fftw_flags 1447 1448 stride = 1 1449 dist = ldx*ldy*ldz 1450 embed = [ldx, ldy, ldz] 1451 n = [nx, ny, nz] 1452 1453 my_plan = fftw3_plan_many_dft(rank3, n, ndat, ff, embed, stride, dist, ff, embed, stride, dist, isign, my_flags, nt_all) 1454 1455 ! Now perform the 3D FFT via FFTW. 1456 call sfftw_execute_dft(my_plan, ff, ff) 1457 1458 call fftw3_destroy_plan(my_plan) 1459 1460 if (isign == ABI_FFTW_FORWARD .and. iscale /= 0) then ! -1, FFTW returns not normalized FTs 1461 call xscal(ldx*ldy*ldz*ndat, REAL(one/(nx*ny*nz),KIND=sp), ff, 1) 1462 end if 1463 1464 #else 1465 ABI_ERROR("FFTW3 support not activated") 1466 ABI_UNUSED((/nx,ny,nz,ldx,ldy,ldz,isign/)) 1467 ABI_UNUSED(ff) 1468 if (PRESENT(fftw_flags)) then 1469 ABI_UNUSED(fftw_flags) 1470 end if 1471 #endif 1472 1473 end subroutine fftw3_c2c_ip_spc
m_fftw3/fftw3_c2c_op_dpc [ Functions ]
[ Top ] [ m_fftw3 ] [ Functions ]
NAME
fftw3_c2c_op_dpc
FUNCTION
Driver routine for out-of-place 3D complex-complex FFT of lengths nx, ny, nz. TARGET: single precision complex arrays
INPUTS
nx,ny,nz=Number of points along the three directions. ldx,ldy,ldz=Physical dimensions of the array. ndat=Number of FFTs to be done. iscale=0 if G --> R FFT should not be scaled. isign= +1 : ff(G) => gg(R); -1 : ff(R) => gg(G) ff(ldx*ldy*ldz*ndat)=The array to be transformed. [fftw_flags]=Flags used to create the plan. They can be combined with the "+" operator. Defaults to ABI_FFTW_ESTIMATE.
OUTPUT
gg(ldx*ldy*ldz*ndat)=The FFT of ff.
SOURCE
1714 subroutine fftw3_c2c_op_dpc(nx, ny, nz, ldx, ldy, ldz, ndat, iscale, isign, ff, gg, fftw_flags) 1715 1716 !Arguments ------------------------------------ 1717 !scalars 1718 integer,intent(in) :: nx,ny,nz,ldx,ldy,ldz,isign,ndat,iscale 1719 integer,optional,intent(in) :: fftw_flags 1720 !arrays 1721 complex(dpc),intent(in) :: ff(ldx*ldy*ldz*ndat) 1722 complex(dpc),intent(out) :: gg(ldx*ldy*ldz*ndat) 1723 1724 #ifdef HAVE_FFTW3 1725 !Local variables------------------------------- 1726 !scalars 1727 integer,parameter :: rank3=3,nt_all=-1 1728 integer :: my_flags,dist,stride 1729 integer(KIND_FFTW_PLAN) :: my_plan 1730 !arrays 1731 integer :: embed(rank3),n(rank3) 1732 1733 ! ************************************************************************* 1734 1735 my_flags=ABI_FFTW_ESTIMATE; if (PRESENT(fftw_flags)) my_flags= fftw_flags 1736 1737 stride = 1 1738 dist = ldx*ldy*ldz 1739 embed = [ldx, ldy, ldz] 1740 n = [nx, ny, nz] 1741 1742 my_plan = fftw3_plan_many_dft(rank3, n, ndat, ff, embed, stride, dist, gg, embed, stride, dist, isign, my_flags, nt_all) 1743 1744 ! Now perform the 3D FFT via FFTW. 1745 call dfftw_execute_dft(my_plan, ff, gg) 1746 1747 call fftw3_destroy_plan(my_plan) 1748 1749 if (isign == ABI_FFTW_FORWARD .and. iscale /= 0) then ! -1, FFTW returns not normalized FTs 1750 call xscal(ldx*ldy*ldz*ndat, one/(nx*ny*nz), gg, 1) 1751 end if 1752 1753 #else 1754 ABI_ERROR("FFTW3 support not activated") 1755 ABI_UNUSED((/nx,ny,nz,ldx,ldy,ldz,isign/)) 1756 ABI_UNUSED(ff) 1757 ABI_UNUSED(gg) 1758 if (PRESENT(fftw_flags)) then 1759 ABI_UNUSED(fftw_flags) 1760 end if 1761 #endif 1762 1763 end subroutine fftw3_c2c_op_dpc
m_fftw3/fftw3_c2c_op_spc [ Functions ]
[ Top ] [ m_fftw3 ] [ Functions ]
NAME
fftw3_c2c_op_spc
FUNCTION
Driver routine for out-of-place 3D complex-complex FFT of lengths nx, ny, nz. TARGET: single precision complex arrays
INPUTS
nx,ny,nz=Number of points along the three directions. ldx,ldy,ldz=Physical dimensions of the array. ndat=Number of FFTs to be done. iscale=0 if G --> R FFT should not be scaled. isign= +1 : ff(G) => gg(R); -1 : ff(R) => gg(G) ff(ldx*ldy*ldz*ndat)=The array to be transformed. [fftw_flags]=Flags used to create the plan. They can be combined with the "+" operator. Defaults to ABI_FFTW_ESTIMATE.
OUTPUT
gg(ldx*ldy*ldz*ndat)=The FFT of ff.
SOURCE
1637 subroutine fftw3_c2c_op_spc(nx, ny, nz, ldx, ldy, ldz, ndat, iscale, isign, ff, gg, fftw_flags) 1638 1639 !Arguments ------------------------------------ 1640 !scalars 1641 integer,intent(in) :: nx,ny,nz,ldx,ldy,ldz,iscale,isign,ndat 1642 integer,optional,intent(in) :: fftw_flags 1643 !arrays 1644 complex(spc),intent(in) :: ff(ldx*ldy*ldz*ndat) 1645 complex(spc),intent(out) :: gg(ldx*ldy*ldz*ndat) 1646 1647 #ifdef HAVE_FFTW3 1648 !Local variables------------------------------- 1649 !scalars 1650 integer,parameter :: rank3=3,nt_all=-1 1651 integer :: my_flags,dist,stride 1652 integer(KIND_FFTW_PLAN) :: my_plan 1653 !arrays 1654 integer :: embed(rank3),n(rank3) 1655 1656 ! ************************************************************************* 1657 1658 my_flags=ABI_FFTW_ESTIMATE; if (PRESENT(fftw_flags)) my_flags= fftw_flags 1659 1660 stride = 1 1661 dist = ldx*ldy*ldz 1662 embed = [ldx, ldy, ldz] 1663 n = [nx, ny, nz] 1664 1665 my_plan = fftw3_plan_many_dft(rank3, n, ndat, ff, embed, stride, dist, gg, embed, stride, dist, isign, my_flags, nt_all) 1666 1667 ! Now perform the 3D FFT via FFTW. 1668 call sfftw_execute_dft(my_plan, ff, gg) 1669 1670 call fftw3_destroy_plan(my_plan) 1671 1672 if (isign == ABI_FFTW_FORWARD .and. iscale /= 0) then ! -1, FFTW returns not normalized FTs 1673 call xscal(ldx*ldy*ldz*ndat, REAL(one/(nx*ny*nz), KIND=sp), gg, 1) 1674 end if 1675 1676 #else 1677 ABI_ERROR("FFTW3 support not activated") 1678 ABI_UNUSED((/nx,ny,nz,ldx,ldy,ldz,isign/)) 1679 ABI_UNUSED(ff) 1680 ABI_UNUSED(gg) 1681 if (PRESENT(fftw_flags)) then 1682 ABI_UNUSED(fftw_flags) 1683 end if 1684 #endif 1685 1686 end subroutine fftw3_c2c_op_spc
m_fftw3/fftw3_c2r_op [ Functions ]
[ Top ] [ m_fftw3 ] [ Functions ]
NAME
fftw3_c2r_op
FUNCTION
Driver routine for out-of-place 3D complex-to-real FFT of lengths nx, ny, nz.
INPUTS
nx,ny,nz=Number of point along the three directions. ldx,ldy,ldz=Physical dimension of the f array (to avoid cache conflicts). ndat=Number of FFTs to be done. ff(2*ldx*ldy*ldz*ndat)=The complex array to be transformed. [fftw_flags]=Flags used to create the plan. They can be combined with the "+" operator. Defaults to ABI_FFTW_ESTIMATE.
OUTPUT
gg(ldx*ldy*ldz*ndat)=The backwards real FFT of ff.
NOTES
FIXME For the time-being. No augmentation of the mesh to reduce memory conflicts, as MKL crashes if the advanced interface is used.
SOURCE
1943 subroutine fftw3_c2r_op(nx,ny,nz,ldx,ldy,ldz,ndat,ff,gg,fftw_flags) 1944 1945 !Arguments ------------------------------------ 1946 !scalars 1947 integer,intent(in) :: nx,ny,nz,ldx,ldy,ldz,ndat 1948 integer,optional,intent(in) :: fftw_flags 1949 !arrays 1950 real(dp),intent(in) :: ff(2,ldx*ldy*ldz*ndat) 1951 real(dp),intent(out) :: gg(ldx*ldy*ldz*ndat) 1952 1953 #ifdef HAVE_FFTW3 1954 !Local variables------------------------------- 1955 !scalars 1956 integer,parameter :: rank3=3,nt_all=-1 1957 integer :: nhp,my_flags,padx,i2,i3,igp,igf,idat,padatf,padatp,idist,odist,stride 1958 integer(KIND_FFTW_PLAN) :: my_plan 1959 !arrays 1960 integer :: inembed(rank3),onembed(rank3),n(rank3) 1961 real(dp),allocatable :: ff_hp(:,:) 1962 1963 ! ************************************************************************* 1964 1965 #ifdef DEV_RC_BUG 1966 if (ANY( (/nx,ny,nz/) /= (/ldx,ldy,ldz/) )) then 1967 ABI_ERROR("Augmentation not supported") 1968 end if 1969 #endif 1970 1971 my_flags=ABI_FFTW_ESTIMATE; if (PRESENT(fftw_flags)) my_flags= fftw_flags 1972 1973 stride = 1 1974 nhp = (nx/2+1)*ny*nz 1975 idist = nhp 1976 odist = ldx*ldy*ldz 1977 n = (/nx,ny,nz/) 1978 inembed = (/(nx/2+1),ny,nz/) 1979 onembed = (/ldx,ldy,ldz/) 1980 1981 ! Fill the Hermitian part: Hermitian redundancy: out[i] is the conjugate of out[n-i] 1982 ABI_MALLOC(ff_hp,(2,nhp*ndat)) 1983 1984 padx = (nx/2+1) 1985 do idat=1,ndat 1986 padatf=(idat-1)*ldx*ldy*ldz 1987 padatp=(idat-1)*padx*ny*nz 1988 !$OMP PARALLEL DO PRIVATE(igf,igp) 1989 do i3=1,nz 1990 do i2=1,ny 1991 igf = (i3-1)*ldx*ldy + (i2-1)*ldx + padatf 1992 igp = (i3-1)*padx*ny + (i2-1)*padx + padatp 1993 ff_hp(:,igp+1:igp+padx) = ff(:,igf+1:igf+padx) 1994 end do 1995 end do 1996 end do 1997 1998 ! NOTE: The c2r transform destroys its input array even for out-of-place transforms. 1999 #ifdef DEV_RC_BUG 2000 if (ndat/=1) ABI_ERROR("ndat/=1 + MKL not coded") 2001 call dfftw_plan_dft_c2r_3d(my_plan, nx, ny, nz, ff_hp, gg, my_flags) 2002 if (my_plan==NULL_PLAN) then 2003 ABI_ERROR("dfftw_plan_dft_c2r_3d returned NULL_PLAN") 2004 end if 2005 #else 2006 my_plan = dplan_many_dft_c2r(rank3, n, ndat, ff_hp, inembed, stride, idist, gg, onembed, stride, odist, my_flags, nt_all) 2007 #endif 2008 2009 ! Now perform the 3D FFT via FFTW. c2r are always ABI_FFTW_BACKWARD 2010 call dfftw_execute_dft_c2r(my_plan, ff_hp, gg) 2011 2012 call fftw3_destroy_plan(my_plan) 2013 ABI_FREE(ff_hp) 2014 2015 #else 2016 ABI_ERROR("FFTW3 support not activated") 2017 ABI_UNUSED((/nx,ny,nz,ldx,ldy,ldz/)) 2018 ABI_UNUSED(ff(1,1)) 2019 ABI_UNUSED(gg(1)) 2020 if (PRESENT(fftw_flags)) then 2021 ABI_UNUSED(fftw_flags) 2022 end if 2023 #endif 2024 2025 end subroutine fftw3_c2r_op
m_fftw3/fftw3_cleanup [ Functions ]
[ Top ] [ m_fftw3 ] [ Functions ]
NAME
fftw3_cleanup
FUNCTION
Reset FFTW to the pristine state it was in when you started your program, All existing plans become undefined.
NOTES
FFTW planner saves some other persistent data, such as the accumulated wisdom and a list of algorithms available in the current configuration. If you want to deallocate all of that and reset FFTW to the pristine state it was in when you started your program, you can call fftw3_cleanup(); After calling fftw3_cleanup, all existing plans become undefined, and you should not attempt to execute them nor to destroy them. You can however create and execute/destroy new plans, in which case FFTW starts accumulating wisdom information again. fftw3_cleanup does not deallocate your plans, however. To prevent memory leaks, you must still call fftw_destroy_plan before executing fftw3_cleanup
SOURCE
2204 subroutine fftw3_cleanup() 2205 2206 ! ************************************************************************* 2207 2208 #ifdef HAVE_FFTW3_MPI 2209 call fftw_mpi_cleanup() 2210 #endif 2211 #ifdef HAVE_FFTW3_THREADS 2212 if (THREADS_INITED==1) then 2213 call dfftw_cleanup_threads() 2214 THREADS_INITED = 0 2215 end if 2216 #elif defined HAVE_FFTW3 2217 call dfftw_cleanup() 2218 #else 2219 ABI_ERROR("FFTW3 support not activated") 2220 #endif 2221 2222 end subroutine fftw3_cleanup
m_fftw3/fftw3_destroy_plan [ Functions ]
[ Top ] [ m_fftw3 ] [ Functions ]
NAME
fftw3_destroy_plan
FUNCTION
Release the memory allocate for the plan.
INPUTS
SOURCE
2238 subroutine fftw3_destroy_plan(plan) 2239 2240 !Arguments ------------------------------------ 2241 !scalars 2242 integer(KIND_FFTW_PLAN),intent(in) :: plan 2243 2244 ! ************************************************************************* 2245 2246 #ifdef HAVE_FFTW3 2247 !$OMP CRITICAL (OMPC_fftw3_destroy_plan) 2248 call dfftw_destroy_plan(plan) 2249 !$OMP END CRITICAL (OMPC_fftw3_destroy_plan) 2250 2251 #else 2252 if (.FALSE.) write(std_out,*)plan 2253 #endif 2254 2255 end subroutine fftw3_destroy_plan
m_fftw3/fftw3_execute_dft_dp [ Functions ]
[ Top ] [ m_fftw3 ] [ Functions ]
NAME
fftw3_execute_dft_dp
FUNCTION
INPUTS
OUTPUT
NOTES
This interface is used to perform complex to complex FFT with real arrays containing the real and imaginary part. I have to admit that this interface is a bit ambiguous since FFTW3 provides routines for real-to-real transforms.
SOURCE
2820 #ifdef HAVE_FFTW3 2821 2822 subroutine fftw3_execute_dft_dp(plan, in, out) 2823 2824 !Arguments ------------------------------------ 2825 !scalars 2826 integer(KIND_FFTW_PLAN),intent(in) :: plan 2827 real(C_DOUBLE),intent(inout) :: in(*) 2828 real(C_DOUBLE),intent(out) :: out(*) 2829 2830 ! ************************************************************************* 2831 2832 call dfftw_execute_dft(plan, in, out) 2833 2834 end subroutine fftw3_execute_dft_dp
m_fftw3/fftw3_execute_dft_dpc [ Functions ]
[ Top ] [ m_fftw3 ] [ Functions ]
NAME
fftw3_execute_dft_dpc
SOURCE
2874 #ifdef HAVE_FFTW3 2875 2876 subroutine fftw3_execute_dft_dpc(plan, in, out) 2877 2878 !Arguments ------------------------------------ 2879 !scalars 2880 integer(KIND_FFTW_PLAN),intent(in) :: plan 2881 complex(C_DOUBLE_COMPLEX),intent(inout) :: in(*) 2882 complex(C_DOUBLE_COMPLEX),intent(out) :: out(*) 2883 2884 ! ************************************************************************* 2885 2886 call dfftw_execute_dft(plan, in, out) 2887 2888 end subroutine fftw3_execute_dft_dpc
m_fftw3/fftw3_execute_dft_spc [ Functions ]
[ Top ] [ m_fftw3 ] [ Functions ]
NAME
fftw3_execute_dft_spc
SOURCE
2847 #ifdef HAVE_FFTW3 2848 2849 subroutine fftw3_execute_dft_spc(plan, in, out) 2850 2851 !Arguments ------------------------------------ 2852 !scalars 2853 integer(KIND_FFTW_PLAN),intent(in) :: plan 2854 complex(C_FLOAT_COMPLEX),intent(inout) :: in(*) 2855 complex(C_FLOAT_COMPLEX),intent(out) :: out(*) 2856 2857 ! ************************************************************************* 2858 2859 call sfftw_execute_dft(plan, in, out) 2860 2861 end subroutine fftw3_execute_dft_spc
m_fftw3/fftw3_fftpad_dp [ Functions ]
[ Top ] [ m_fftw3 ] [ Functions ]
NAME
fftw3_fftpad_dp
FUNCTION
This routine transforms wavefunctions using 3D zero-padded FFTs with FFTW3. The 3D ffts are computed only on lines and planes which have non zero elements. These lines and planes are defined by the two vectors do_fft_x(ldy*nz) and do_fft_y(nz) FFT transform is in-place.
INPUTS
nx,ny,nz=Logical dimensions of the FFT mesh. ldx,ldy,ldz=Physical dimension of the f array (to avoid cache conflicts). ndat=Number of FFT transforms. mgfft=MAX(nx,ny,nz), only used to dimension gbound isign=The sign of the transform. gbound(2*mgfft+8,2)= The boundaries of the basis sphere of G vectors at a given k-point. See sphereboundary for more info.
SIDE EFFECTS
ff(2*ldx*ldy*ldz*ndat)= input: The array with the data to be transformed. output: The results of the FFT.
SOURCE
2406 subroutine fftw3_fftpad_dp(ff, nx, ny, nz, ldx, ldy, ldz, ndat, mgfft, isign, gbound, iscale) 2407 2408 !Arguments ------------------------------------ 2409 !scalars 2410 integer,intent(in) :: nx,ny,nz,ldx,ldy,ldz,ndat,mgfft,isign 2411 !arrays 2412 integer,intent(in) :: gbound(2*mgfft+8,2) 2413 real(dp),intent(inout) :: ff(2*ldx*ldy*ldz*ndat) 2414 integer,optional,intent(in) :: iscale 2415 2416 !Local variables------------------------------- 2417 !scalars 2418 #ifdef HAVE_FFTW3 2419 integer,parameter :: dst=2 2420 integer :: iscale__ 2421 real(dp) :: fact 2422 2423 ! ************************************************************************* 2424 2425 iscale__ = merge(1, 0, isign == -1); if (present(iscale)) iscale__ = iscale 2426 2427 #include "fftw3_fftpad.finc" 2428 2429 #else 2430 ABI_ERROR("FFTW3 support not activated") 2431 ABI_UNUSED((/nx,ny,nz,ldx,ldy,ldz,mgfft,isign/)) 2432 ABI_UNUSED(gbound(1,1)) 2433 ABI_UNUSED(ff(1)) 2434 #endif 2435 2436 end subroutine fftw3_fftpad_dp
m_fftw3/fftw3_fftpad_dpc [ Functions ]
[ Top ] [ m_fftw3 ] [ Functions ]
NAME
fftw3_fftpad_dpc
FUNCTION
This routine transforms wavefunctions using 3D zero-padded FFTs with FFTW3. The 3D ffts are computed only on lines and planes which have non zero elements. These lines and planes are defined by the two vectors do_fft_x(ldy*nz) and do_fft_y(nz) FFT transform is in-place. Target: complex arrays.
INPUTS
nx,ny,nz=Logical dimensions of the FFT mesh. ldx,ldy,ldz=Physical dimension of the f array (to avoid cache conflicts). ndat=Number of FFT transforms. mgfft=MAX(nx,ny,nz), only used to dimension gbound. isign=The sign of the transform. gbound(2*mgfft+8,2)= The boundaries of the basis sphere of G vectors at a given k-point. See sphereboundary for more info.
SIDE EFFECTS
ff(ldx*ldy*ldz*ndat)= input: The array with the data to be transformed. output: The results of the FFT.
SOURCE
2467 subroutine fftw3_fftpad_dpc(ff, nx, ny, nz, ldx, ldy, ldz, ndat, mgfft, isign, gbound, iscale) 2468 2469 !Arguments ------------------------------------ 2470 !scalars 2471 integer,intent(in) :: nx,ny,nz,ldx,ldy,ldz,ndat,mgfft,isign 2472 integer,optional,intent(in) :: iscale 2473 !arrays 2474 integer,intent(in) :: gbound(2*mgfft+8,2) 2475 complex(dpc),intent(inout) :: ff(ldx*ldy*ldz*ndat) 2476 2477 #ifdef HAVE_FFTW3 2478 !Local variables------------------------------- 2479 integer,parameter :: dst=1 2480 integer :: iscale__ 2481 real(dp) :: fact 2482 2483 ! ************************************************************************* 2484 2485 iscale__ = merge(1, 0, isign == -1); if (present(iscale)) iscale__ = iscale 2486 2487 #include "fftw3_fftpad.finc" 2488 2489 #else 2490 ABI_ERROR("FFTW3 support not activated") 2491 ABI_UNUSED((/nx,ny,nz,ldx,ldy,ldz,ndat,mgfft,isign/)) 2492 ABI_UNUSED(gbound(1,1)) 2493 ABI_UNUSED(ff(1)) 2494 #endif 2495 2496 end subroutine fftw3_fftpad_dpc
m_fftw3/fftw3_fftpad_spc [ Functions ]
[ Top ] [ m_fftw3 ] [ Functions ]
NAME
fftw3_fftpad_spc
FUNCTION
This routine transforms wavefunctions using 3D zero-padded FFTs with FFTW3. The 3D ffts are computed only on lines and planes which have non zero elements. These lines and planes are defined by the two vectors do_fft_x(ldy*nz) and do_fft_y(nz) FFT transform is in-place. Target: complex arrays.
INPUTS
nx,ny,nz=Logical dimensions of the FFT mesh. ldx,ldy,ldz=Physical dimension of the f array (to avoid cache conflicts). ndat=Number of FFT transforms. mgfft=MAX(nx,ny,nz), only used to dimension gbound. isign=The sign of the transform. gbound(2*mgfft+8,2)= The boundaries of the basis sphere of G vectors at a given k-point. See sphereboundary for more info.
SIDE EFFECTS
ff(ldx*ldy*ldz*ndat)= input: The array with the data to be transformed. output: The results of the FFT.
SOURCE
1504 subroutine fftw3_fftpad_spc(ff, nx, ny, nz, ldx, ldy, ldz, ndat, mgfft, isign, gbound, iscale) 1505 1506 !Arguments ------------------------------------ 1507 !scalars 1508 integer,intent(in) :: nx,ny,nz,ldx,ldy,ldz,ndat,mgfft,isign 1509 !arrays 1510 integer,intent(in) :: gbound(2*mgfft+8,2) 1511 complex(spc),intent(inout) :: ff(ldx*ldy*ldz*ndat) 1512 integer,optional,intent(in) :: iscale 1513 1514 #ifdef HAVE_FFTW3 1515 !Local variables------------------------------- 1516 integer,parameter :: dst=1 1517 integer :: iscale__ 1518 real(sp) :: fact 1519 1520 ! ************************************************************************* 1521 1522 iscale__ = merge(1, 0, isign == -1); if (present(iscale)) iscale__ = iscale 1523 1524 #include "fftw3_fftpad.finc" 1525 1526 #else 1527 ABI_ERROR("FFTW3 support not activated") 1528 ABI_UNUSED((/nx,ny,nz,ldx,ldy,ldz,ndat,mgfft,isign/)) 1529 ABI_UNUSED(gbound(1,1)) 1530 ABI_UNUSED(ff(1)) 1531 #endif 1532 1533 end subroutine fftw3_fftpad_spc
m_fftw3/fftw3_fftrisc_dp [ Functions ]
[ Top ] [ m_fftw3 ] [ Functions ]
NAME
fftw3_fftrisc_dp
FUNCTION
Carry out Fourier transforms between real and reciprocal (G) space, for wavefunctions, contained in a sphere in reciprocal space, in both directions. Also accomplish some post-processing.
NOTES
Specifically uses rather sophisticated algorithms, based on S Goedecker routines, specialized for superscalar RISC architecture. Zero padding : saves 7/12 execution time Bi-dimensional data locality in most of the routine : cache reuse For k-point (0 0 0) : takes advantage of symmetry of data. Note however that no blocking is used, in both 1D z-transform or subsequent 2D transform. This should be improved.
INPUTS
cplex= if 1 , denpot is real, if 2 , denpot is complex (cplex=2 only allowed for option=2 when istwf_k=1) one can also use cplex=0 if option=0 or option=3 fofgin(2,npwin)=holds input wavefunction in G vector basis sphere. gboundin(2*mgfft+8,2)=sphere boundary info for reciprocal to real space gboundout(2*mgfft+8,2)=sphere boundary info for real to reciprocal space istwf_k=option parameter that describes the storage of wfs kg_kin(3,npwin)=reduced planewave coordinates, input kg_kout(3,npwout)=reduced planewave coordinates, output mgfft=maximum size of 1D FFTs ngfft(18)=contain all needed information about 3D FFT, see ~abinit/doc/variables/vargs.htm#ngfft npwin=number of elements in fofgin array (for option 0, 1 and 2) npwout=number of elements in fofgout array (for option 2 and 3) ldx,ldy,ldz=ngfft(4),ngfft(5),ngfft(6), dimensions of fofr. option= if 0: do direct FFT if 1: do direct FFT, then sum the density if 2: do direct FFT, multiply by the potential, then do reverse FFT if 3: do reverse FFT only weight=weight to be used for the accumulation of the density in real space (needed only when option=1)
OUTPUT
(see side effects)
OPTIONS
The different options are: - reciprocal to real space and output the result (when option=0), - reciprocal to real space and accumulate the density (when option=1) or - reciprocal to real space, apply the local potential to the wavefunction in real space and produce the result in reciprocal space (when option=2) - real space to reciprocal space (when option=3). option=0 IS NOT ALLOWED when istwf_k>2 option=3 IS NOT ALLOWED when istwf_k>=2
SIDE EFFECTS
for option==0, fofgin(2,npwin)=holds input wavefunction in G sphere; fofr(2,ldx,ldy,ldz) contains the Fourier Transform of fofgin; no use of denpot, fofgout and npwout. for option==1, fofgin(2,npwin)=holds input wavefunction in G sphere; denpot(cplex*ldx,ldy,ldz) contains the input density at input, and the updated density at output; no use of fofgout and npwout. for option==2, fofgin(2,npwin)=holds input wavefunction in G sphere; denpot(cplex*ldx,ldy,ldz) contains the input local potential; fofgout(2,npwout) contains the output function; for option==3, fofr(2,ldx,ldy,ldz) contains the real space wavefunction; fofgout(2,npwout) contains its Fourier transform; no use of fofgin and npwin.
SOURCE
892 subroutine fftw3_fftrisc_dp(cplex,denpot,fofgin,fofgout,fofr,gboundin,gboundout,istwf_k,kg_kin,kg_kout,& 893 mgfft,ngfft,npwin,npwout,ldx,ldy,ldz,option, & 894 weight_r, weight_i, abi_convention, iscale) 895 896 !Arguments ------------------------------------ 897 !scalars 898 integer,intent(in) :: cplex,istwf_k,mgfft,ldx,ldy,ldz,npwin,npwout,option 899 real(dp),intent(in) :: weight_r,weight_i 900 !arrays 901 integer,intent(in) :: gboundin(2*mgfft+8,2),gboundout(2*mgfft+8,2) 902 integer,intent(in) :: kg_kin(3,npwin),kg_kout(3,npwout),ngfft(18) 903 real(dp),intent(in) :: fofgin(2,npwin) 904 real(dp),intent(inout) :: denpot(cplex*ldx,ldy,ldz),fofr(2,ldx*ldy*ldz) 905 real(dp),intent(inout) :: fofgout(2,npwout) 906 logical,optional,intent(in) :: abi_convention 907 integer,optional,intent(in) :: iscale 908 909 ! ************************************************************************* 910 911 #ifdef HAVE_FFTW3 912 913 #undef FFT_PRECISION 914 #undef MYKIND 915 #undef MYCZERO 916 #undef MYCMPLX 917 #undef MYCONJG 918 919 #define FFT_PRECISION FFT_DOUBLE 920 #define MYKIND DPC 921 #define MYCZERO (0._dp,0._dp) 922 #define MYCMPLX DCMPLX 923 #define MYCONJG DCONJG 924 925 #include "fftw3_fftrisc.finc" 926 927 #else 928 ABI_ERROR("FFTW3 support not activated") 929 ABI_UNUSED((/cplex,gboundin(1,1),gboundout(1,1),istwf_k,kg_kin(1,1),kg_kout(1,1)/)) 930 ABI_UNUSED((/mgfft,ngfft(1),npwin,npwout,ldx,ldy,ldz,option/)) 931 ABI_UNUSED((/denpot(1,1,1),fofgin(1,1),fofgout(1,1),fofr(1,1),weight_r,weight_i/)) 932 #endif 933 934 end subroutine fftw3_fftrisc_dp
m_fftw3/fftw3_fftrisc_mixprec [ Functions ]
[ Top ] [ m_fftw3 ] [ Functions ]
NAME
fftw3_fftrisc_mixprec
FUNCTION
Mixed precision version of fftrisc: input/output in dp, computation done in sp. See fftw3_fftrisc_dp for API docs.
SOURCE
949 subroutine fftw3_fftrisc_mixprec(cplex,denpot,fofgin,fofgout,fofr,gboundin,gboundout,istwf_k,kg_kin,kg_kout,& 950 mgfft,ngfft,npwin,npwout,ldx,ldy,ldz,option, & 951 weight_r,weight_i, abi_convention, iscale) ! optional 952 953 !Arguments ------------------------------------ 954 !scalars 955 integer,intent(in) :: cplex,istwf_k,mgfft,ldx,ldy,ldz,npwin,npwout,option 956 real(dp),intent(in) :: weight_r,weight_i 957 !arrays 958 integer,intent(in) :: gboundin(2*mgfft+8,2),gboundout(2*mgfft+8,2) 959 integer,intent(in) :: kg_kin(3,npwin),kg_kout(3,npwout),ngfft(18) 960 real(dp),intent(in) :: fofgin(2,npwin) 961 real(dp),intent(inout) :: denpot(cplex*ldx,ldy,ldz),fofr(2,ldx*ldy*ldz) 962 real(dp),intent(inout) :: fofgout(2,npwout) 963 logical,optional,intent(in) :: abi_convention 964 integer,optional,intent(in) :: iscale 965 966 ! ************************************************************************* 967 968 #ifdef HAVE_FFTW3 969 970 #undef FFT_PRECISION 971 #undef MYKIND 972 #undef MYCZERO 973 #undef MYCMPLX 974 #undef MYCONJG 975 976 #define FFT_PRECISION FFT_MIXPREC 977 #define MYKIND SPC 978 #define MYCZERO (0._sp,0._sp) 979 #define MYCMPLX CMPLX 980 #define MYCONJG CONJG 981 982 #include "fftw3_fftrisc.finc" 983 984 #else 985 ABI_ERROR("FFTW3 support not activated") 986 ABI_UNUSED((/cplex,gboundin(1,1),gboundout(1,1),istwf_k,kg_kin(1,1),kg_kout(1,1)/)) 987 ABI_UNUSED((/mgfft,ngfft(1),npwin,npwout,ldx,ldy,ldz,option/)) 988 ABI_UNUSED((/denpot(1,1,1),fofgin(1,1),fofgout(1,1),fofr(1,1),weight_r,weight_i/)) 989 #endif 990 991 end subroutine fftw3_fftrisc_mixprec
m_fftw3/fftw3_fftrisc_sp [ Functions ]
[ Top ] [ m_fftw3 ] [ Functions ]
NAME
fftw3_fftrisc_sp
FUNCTION
Carry out Fourier transforms between real and reciprocal (G) space, for wavefunctions, contained in a sphere in reciprocal space, in both directions. Also accomplish some post-processing. See fftw3_fftrisc_dp for API doc.
SOURCE
773 subroutine fftw3_fftrisc_sp(cplex,denpot,fofgin,fofgout,fofr,gboundin,gboundout,istwf_k,kg_kin,kg_kout,& 774 mgfft,ngfft,npwin,npwout,ldx,ldy,ldz,option, & 775 weight_r,weight_i, abi_convention, iscale) 776 777 !Arguments ------------------------------------ 778 !scalars 779 integer,intent(in) :: cplex,istwf_k,mgfft,ldx,ldy,ldz,npwin,npwout,option 780 real(dp),intent(in) :: weight_i,weight_r 781 !arrays 782 integer,intent(in) :: gboundin(2*mgfft+8,2),gboundout(2*mgfft+8,2) 783 integer,intent(in) :: kg_kin(3,npwin),kg_kout(3,npwout),ngfft(18) 784 real(sp),intent(in) :: fofgin(2,npwin) 785 real(dp),intent(inout) :: denpot(cplex*ldx,ldy,ldz) 786 real(sp),intent(inout) :: fofr(2,ldx*ldy*ldz) 787 real(sp),intent(inout) :: fofgout(2,npwout) 788 logical,optional,intent(in) :: abi_convention 789 integer,optional,intent(in) :: iscale 790 791 ! ************************************************************************* 792 793 #ifdef HAVE_FFTW3 794 795 #undef FFT_PRECISION 796 #undef MYKIND 797 #undef MYCZERO 798 #undef MYCMPLX 799 #undef MYCONJG 800 801 #define FFT_PRECISION FFT_SINGLE 802 #define MYKIND SPC 803 #define MYCZERO (0._sp,0._sp) 804 #define MYCMPLX CMPLX 805 #define MYCONJG CONJG 806 807 #include "fftw3_fftrisc.finc" 808 809 #else 810 ABI_ERROR("FFTW3 support not activated") 811 ABI_UNUSED((/cplex,gboundin(1,1),gboundout(1,1),istwf_k,kg_kin(1,1),kg_kout(1,1)/)) 812 ABI_UNUSED((/mgfft,ngfft(1),npwin,npwout,ldx,ldy,ldz,option/)) 813 ABI_UNUSED((/denpot(1,1,1),weight_r,weight_i/)) 814 ABI_UNUSED((/fofgin(1,1),fofgout(1,1),fofr(1,1)/)) 815 #endif 816 817 end subroutine fftw3_fftrisc_sp
m_fftw3/fftw3_fftug_dp [ Functions ]
[ Top ] [ m_fftw3 ] [ Functions ]
NAME
fftw3_fftug_dp
FUNCTION
Compute ndat zero-padded FFTs from G to R space. Mainly used for the transform of wavefunctions. TARGET: dp arrays with real and imaginary part
INPUTS
fftalg=FFT algorith (see input variable) fftcache=size of the cache (kB) npw_k=number of plane waves for this k-point. nx,ny,nz=Number of point along the three directions. ldx,ldy,ldz=Leading dimensions of the array. ndat=Number of transforms istwf_k=Option describing the storage of the wavefunction. mgfft=Max number of FFT divisions (used to dimension gbound) kg_k(3,npw_k)=G-vectors in reduced coordinates gbound(2*mgfft+8,2)=Table for zero-padded FFT. See sphereboundary. ug(npw_k*ndat)=wavefunctions in reciprocal space.
OUTPUT
ur(ldx*ldy*ldz*ndat)=wavefunctions in real space.
SOURCE
1023 subroutine fftw3_fftug_dp(fftalg, fftcache, npw_k, nx, ny, nz, ldx, ldy, ldz, ndat, & 1024 istwf_k, mgfft, kg_k,gbound, ug, ur, & 1025 isign, iscale) ! optional 1026 1027 !Arguments ------------------------------------ 1028 !scalars 1029 integer,intent(in) :: fftalg,fftcache 1030 integer,intent(in) :: npw_k,nx,ny,nz,ldx,ldy,ldz,ndat,istwf_k,mgfft 1031 !arrays 1032 integer,intent(in) :: gbound(2*mgfft+8,2),kg_k(3,npw_k) 1033 real(dp),target,intent(in) :: ug(2*npw_k*ndat) 1034 real(dp),target,intent(inout) :: ur(2*ldx*ldy*ldz*ndat) 1035 integer,optional,intent(in) :: isign, iscale 1036 1037 #ifdef HAVE_FFTW3 1038 !Local variables------------------------------- 1039 !scalars 1040 integer,parameter :: dist=2 1041 integer :: iscale__, isign__ 1042 real(dp) :: fofgout(2,0) 1043 real(dp),ABI_CONTIGUOUS pointer :: real_ug(:,:),real_ur(:,:) 1044 1045 ! ************************************************************************* 1046 1047 iscale__ = 0; if (present(iscale)) iscale__ = iscale 1048 isign__ = +1; if (present(isign)) isign__ = isign 1049 1050 #undef TK_PREF 1051 #define TK_PREF(name) CONCAT(cg_,name) 1052 1053 #undef FFT_PRECISION 1054 #define FFT_PRECISION FFT_DOUBLE 1055 1056 #include "fftug.finc" 1057 1058 #undef FFT_PRECISION 1059 1060 #else 1061 ! Silence compiler warning 1062 ABI_ERROR("FFT_FFTW3 support not activated") 1063 ABI_UNUSED((/fftalg,fftcache,npw_k,nx,ny,nz,ldx,ldy,ldz,ndat,istwf_k,mgfft,kg_k(1,1),gbound(1,1),iscale,isign/)) 1064 ABI_UNUSED((/ug(1),ur(1)/)) 1065 #endif 1066 1067 end subroutine fftw3_fftug_dp
m_fftw3/fftw3_fftug_dpc [ Functions ]
[ Top ] [ m_fftw3 ] [ Functions ]
NAME
fftw3_fftug_dpc
FUNCTION
Compute ndat zero-padded FFTs. Mainly used for the transform of wavefunctions. TARGET: DPC arrays See fftw3_fftug_dp for API docs.
SOURCE
1146 subroutine fftw3_fftug_dpc(fftalg, fftcache, npw_k, nx, ny, nz, ldx, ldy, ldz, ndat, & 1147 istwf_k, mgfft, kg_k, gbound, ug, ur, & 1148 isign, iscale) ! optional 1149 1150 !Arguments ------------------------------------ 1151 !scalars 1152 integer,intent(in) :: fftalg,fftcache 1153 integer,intent(in) :: npw_k,nx,ny,nz,ldx,ldy,ldz,ndat,istwf_k,mgfft 1154 !arrays 1155 integer,intent(in) :: gbound(2*mgfft+8,2),kg_k(3,npw_k) 1156 complex(dpc),target,intent(in) :: ug(npw_k*ndat) 1157 complex(dpc),target,intent(inout) :: ur(ldx*ldy*ldz*ndat) 1158 integer,optional,intent(in) :: isign, iscale 1159 1160 #ifdef HAVE_FFTW3 1161 !Local variables------------------------------- 1162 !scalars 1163 integer,parameter :: dist=1 1164 integer :: iscale__, isign__ 1165 !arrays 1166 real(dp) :: fofgout(2,0) 1167 real(dp),ABI_CONTIGUOUS pointer :: real_ug(:,:),real_ur(:,:) 1168 1169 ! ************************************************************************* 1170 1171 iscale__ = 0; if (present(iscale)) iscale__ = iscale 1172 isign__ = +1; if (present(isign)) isign__ = isign 1173 1174 #undef TK_PREF 1175 #define TK_PREF(name) CONCAT(cplx_,name) 1176 1177 #undef FFT_PRECISION 1178 #define FFT_PRECISION FFT_DOUBLE 1179 1180 #include "fftug.finc" 1181 1182 #undef FFT_PRECISION 1183 1184 #else 1185 ! Silence compiler warning 1186 ABI_ERROR("FFTW3 support not activated") 1187 ABI_UNUSED((/fftalg,fftcache,npw_k,nx,ny,nz,ldx,ldy,ldz,ndat,istwf_k,mgfft,kg_k(1,1),gbound(1,1)/)) 1188 ABI_UNUSED((/ug(1),ur(1)/)) 1189 #endif 1190 1191 end subroutine fftw3_fftug_dpc
m_fftw3/fftw3_fftug_spc [ Functions ]
[ Top ] [ m_fftw3 ] [ Functions ]
NAME
fftw3_fftug_spc
FUNCTION
Compute ndat zero-padded FFTs from G-->R. Mainly used for the transform of wavefunctions. TARGET: spc arrays See fftw3_fftug_dp for API docs.
SOURCE
1084 subroutine fftw3_fftug_spc(fftalg, fftcache, npw_k, nx, ny, nz, ldx, ldy, ldz, ndat, & 1085 istwf_k, mgfft, kg_k, gbound, ug, ur, & 1086 isign, iscale) ! optional 1087 1088 !Arguments ------------------------------------ 1089 !scalars 1090 integer,intent(in) :: fftalg,fftcache 1091 integer,intent(in) :: npw_k,nx,ny,nz,ldx,ldy,ldz,ndat,istwf_k,mgfft 1092 integer,optional,intent(in) :: isign, iscale 1093 !arrays 1094 integer,intent(in) :: gbound(2*mgfft+8,2),kg_k(3,npw_k) 1095 complex(spc),target,intent(in) :: ug(npw_k*ndat) 1096 complex(spc),target,intent(inout) :: ur(ldx*ldy*ldz*ndat) 1097 1098 #ifdef HAVE_FFTW3 1099 !Local variables------------------------------- 1100 !scalars 1101 integer,parameter :: dist=1 1102 integer :: iscale__, isign__ 1103 !arrays 1104 real(sp) :: fofgout(2,0) 1105 real(sp),ABI_CONTIGUOUS pointer :: real_ug(:,:),real_ur(:,:) 1106 1107 ! ************************************************************************* 1108 1109 iscale__ = 0; if (present(iscale)) iscale__ = iscale 1110 isign__ = +1; if (present(isign)) isign__ = isign 1111 1112 #undef TK_PREF 1113 #define TK_PREF(name) CONCAT(cplx_,name) 1114 1115 #undef FFT_PRECISION 1116 #define FFT_PRECISION FFT_SINGLE 1117 1118 #include "fftug.finc" 1119 1120 #undef FFT_PRECISION 1121 1122 #else 1123 ! Silence compiler warning 1124 ABI_ERROR("FFTW3 support not activated") 1125 ABI_UNUSED((/fftalg,fftcache,npw_k,nx,ny,nz,ldx,ldy,ldz,ndat,istwf_k,mgfft,kg_k(1,1),gbound(1,1)/)) 1126 ABI_UNUSED((/ug(1),ur(1)/)) 1127 #endif 1128 1129 end subroutine fftw3_fftug_spc
m_fftw3/fftw3_fftur_dp [ Functions ]
[ Top ] [ m_fftw3 ] [ Functions ]
NAME
fftw3_fftur_dp
FUNCTION
Compute ndat zero-padded FFTs from R- to G-space . Mainly used for the transform of wavefunctions. TARGET: dp arrays
INPUTS
fftalg=FFT algorith (see input variable) fftcache=size of the cache (kB) npw_k=number of plane waves for this k-point. nx,ny,nz=Number of point along the three directions. ldx,ldy,ldz=Leading dimensions of the array. ndat=Number of transforms istwf_k=Option describing the storage of the wavefunction. mgfft=Max number of FFT divisions (used to dimension gbound) kg_k(3,npw_k)=G-vectors in reduced coordinates gbound(2*mgfft+8,2)=Table for padded-FFT. See sphereboundary. SIDE EFFECT ur(ldx*ldy*ldz*ndat)= In input: wavefunctions in real space. Destroyed in output. Do not use it anymore!
OUTPUT
ug(npw_k*ndat)=wavefunctions in reciprocal space.
SOURCE
1225 subroutine fftw3_fftur_dp(fftalg, fftcache, npw_k, nx, ny, nz, ldx, ldy, ldz, ndat, & 1226 istwf_k, mgfft, kg_k, gbound, ur, ug, & 1227 isign, iscale) ! optional 1228 1229 !Arguments ------------------------------------ 1230 !scalars 1231 integer,intent(in) :: fftalg,fftcache 1232 integer,intent(in) :: npw_k,nx,ny,nz,ldx,ldy,ldz,ndat,istwf_k,mgfft 1233 integer,optional,intent(in) :: isign, iscale 1234 !arrays 1235 integer,intent(in) :: gbound(2*mgfft+8,2),kg_k(3,npw_k) 1236 real(dp),target,intent(inout) :: ur(2*ldx*ldy*ldz*ndat) 1237 real(dp),target,intent(inout) :: ug(2*npw_k*ndat) 1238 1239 #ifdef HAVE_FFTW3 1240 !Local variables------------------------------- 1241 !scalars 1242 integer,parameter :: dist=2 1243 integer :: iscale__, isign__ 1244 !arrays 1245 real(dp) :: dum_ugin(2,0) 1246 real(dp),ABI_CONTIGUOUS pointer :: real_ug(:,:),real_ur(:,:) 1247 1248 ! ************************************************************************* 1249 1250 iscale__ = 1; if (present(iscale)) iscale__ = iscale 1251 isign__ = -1; if (present(isign)) isign__ = isign 1252 1253 #undef TK_PREF 1254 #define TK_PREF(name) CONCAT(cg_,name) 1255 1256 #undef FFT_PRECISION 1257 #define FFT_PRECISION FFT_DOUBLE 1258 1259 #include "fftur.finc" 1260 1261 #undef FFT_PRECISION 1262 1263 #else 1264 ! Silence compiler warning 1265 ABI_ERROR("FFTW3 support not activated") 1266 ABI_UNUSED((/fftalg,fftcache/)) 1267 ABI_UNUSED((/npw_k,nx,ny,nz,ldx,ldy,ldz,ndat,istwf_k,mgfft,kg_k(1,1),gbound(1,1)/)) 1268 ABI_UNUSED((/ug(1),ur(1)/)) 1269 #endif 1270 1271 end subroutine fftw3_fftur_dp
m_fftw3/fftw3_fftur_dpc [ Functions ]
[ Top ] [ m_fftw3 ] [ Functions ]
NAME
fftw3_fftur_dpc
FUNCTION
Compute ndat zero-padded FFTs from R ro G. Mainly used for the transform of wavefunctions. TARGET: DPC arrays See fftw3_fftur_dp for API doc.
SOURCE
1351 subroutine fftw3_fftur_dpc(fftalg, fftcache, npw_k, nx, ny, nz, ldx, ldy, ldz, ndat, & 1352 istwf_k, mgfft, kg_k, gbound, ur, ug, & 1353 isign, iscale) ! optional 1354 1355 !Arguments ------------------------------------ 1356 !scalars 1357 integer,intent(in) :: fftalg,fftcache 1358 integer,intent(in) :: npw_k,nx,ny,nz,ldx,ldy,ldz,ndat,istwf_k,mgfft 1359 integer,optional,intent(in) :: isign, iscale 1360 !arrays 1361 integer,intent(in) :: gbound(2*mgfft+8,2),kg_k(3,npw_k) 1362 complex(dpc),target,intent(inout) :: ur(ldx*ldy*ldz*ndat) 1363 complex(dpc),target,intent(inout) :: ug(npw_k*ndat) 1364 1365 #ifdef HAVE_FFTW3 1366 !Local variables------------------------------- 1367 !scalars 1368 integer,parameter :: dist=1 1369 integer :: iscale__, isign__ 1370 !arrays 1371 real(dp) :: dum_ugin(2,0) 1372 real(dp),ABI_CONTIGUOUS pointer :: real_ug(:,:),real_ur(:,:) 1373 1374 ! ************************************************************************* 1375 1376 iscale__ = 1; if (present(iscale)) iscale__ = iscale 1377 isign__ = -1; if (present(isign)) isign__ = isign 1378 1379 #undef TK_PREF 1380 #define TK_PREF(name) CONCAT(cplx_,name) 1381 1382 #undef FFT_PRECISION 1383 #define FFT_PRECISION FFT_DOUBLE 1384 1385 #include "fftur.finc" 1386 1387 #undef FFT_PRECISION 1388 1389 #else 1390 ! Silence compiler warning 1391 ABI_ERROR("FFTW3 support not activated") 1392 ABI_UNUSED((/fftalg,fftcache/)) 1393 ABI_UNUSED((/npw_k,nx,ny,nz,ldx,ldy,ldz,ndat,istwf_k,mgfft,kg_k(1,1),gbound(1,1)/)) 1394 ABI_UNUSED((/ug(1),ur(1)/)) 1395 #endif 1396 1397 end subroutine fftw3_fftur_dpc
m_fftw3/fftw3_fftur_spc [ Functions ]
[ Top ] [ m_fftw3 ] [ Functions ]
NAME
fftw3_fftur_spc
FUNCTION
Compute ndat zero-padded FFTs from R- to G-space . Mainly used for the transform of wavefunctions. TARGET: spc arrays See fftw3_fftur_dp for API doc.
SOURCE
1288 subroutine fftw3_fftur_spc(fftalg, fftcache, npw_k, nx, ny, nz, ldx, ldy, ldz, ndat, & 1289 istwf_k, mgfft, kg_k, gbound, ur, ug, & 1290 isign, iscale) ! optional 1291 1292 !Arguments ------------------------------------ 1293 !scalars 1294 integer,intent(in) :: fftalg,fftcache 1295 integer,intent(in) :: npw_k,nx,ny,nz,ldx,ldy,ldz,ndat,istwf_k,mgfft 1296 integer,optional,intent(in) :: isign, iscale 1297 !arrays 1298 integer,intent(in) :: gbound(2*mgfft+8,2),kg_k(3,npw_k) 1299 complex(spc),target,intent(inout) :: ur(ldx*ldy*ldz*ndat) 1300 complex(spc),target,intent(inout) :: ug(npw_k*ndat) 1301 1302 #ifdef HAVE_FFTW3 1303 !Local variables------------------------------- 1304 !scalars 1305 integer,parameter :: dist=1 1306 integer :: iscale__, isign__ 1307 !arrays 1308 real(sp) :: dum_ugin(2,0) 1309 real(sp),ABI_CONTIGUOUS pointer :: real_ug(:,:),real_ur(:,:) 1310 1311 ! ************************************************************************* 1312 1313 iscale__ = 1; if (present(iscale)) iscale__ = iscale 1314 isign__ = -1; if (present(isign)) isign__ = isign 1315 1316 #undef TK_PREF 1317 #define TK_PREF(name) CONCAT(cplx_,name) 1318 1319 #undef FFT_PRECISION 1320 #define FFT_PRECISION FFT_SINGLE 1321 1322 #include "fftur.finc" 1323 1324 #undef FFT_PRECISION 1325 1326 #else 1327 ! Silence compiler warning 1328 ABI_ERROR("FFTW3 support not activated") 1329 ABI_UNUSED((/fftalg,fftcache/)) 1330 ABI_UNUSED((/npw_k,nx,ny,nz,ldx,ldy,ldz,ndat,istwf_k,mgfft,kg_k(1,1),gbound(1,1)/)) 1331 ABI_UNUSED((/ug(1),ur(1)/)) 1332 #endif 1333 1334 end subroutine fftw3_fftur_spc
m_fftw3/fftw3_init_threads [ Functions ]
[ Top ] [ m_fftw3 ] [ Functions ]
NAME
fftw3_init_threads
FUNCTION
This function performs the one-time initialization required to use FFTW3 threads. It does nothing if HAVE_FFT_FFTW3_THREADS is not defined.
INPUTS
SIDE EFFECTS
The one-time initialization required to use FFTW3 threads is performed when the routine is called for the first time.
SOURCE
2276 subroutine fftw3_init_threads() 2277 2278 !Local variables ------------------------------ 2279 !scalars 2280 #ifdef HAVE_FFTW3_THREADS 2281 integer :: iret 2282 #endif 2283 2284 ! ************************************************************************* 2285 2286 #ifdef HAVE_FFTW3_THREADS 2287 if (THREADS_INITED==0) then 2288 !call wrtout(std_out,"Calling dfftw_init_threads()") 2289 call dfftw_init_threads(iret) 2290 2291 if (iret==0) then 2292 ABI_WARNING(" dfftw_init_threads returned 0; threaded FFTW3 is not being used!") 2293 else 2294 THREADS_INITED=1 2295 end if 2296 call fftw3_set_nthreads() 2297 end if 2298 2299 #ifndef HAVE_OPENMP 2300 ABI_WARNING("Using FFTW3 with threads but HAVE_OPENMP is not defined!") 2301 #endif 2302 #endif 2303 2304 #ifdef HAVE_FFTW3_MPI 2305 !call wrtout(std_out,"Calling fftw_mpi_init()") 2306 call fftw_mpi_init() 2307 #endif 2308 2309 end subroutine fftw3_init_threads
m_fftw3/fftw3_many_dft_ip [ Functions ]
[ Top ] [ m_fftw3 ] [ Functions ]
NAME
fftw3_many_dft_ip
FUNCTION
Driver routine for many in-place 3D complex-to-complex FFTs of lengths nx, ny, nz.
INPUTS
nx,ny,nz=Number of points along the three directions. ldx,ldy,ldz=Physical dimension of the finout array (to avoid cache conflicts). ndat=Number of FFTs to be done. isign=sign of Fourier transform exponent: current convention uses +1 for transforming from G to r, -1 for transforming from r to G. [fftw_flags]=Flags used to create the plan. They can be combined with the "+" operator. Defaults to ABI_FFTW_ESTIMATE.
OUTPUT
finout(2,ldx*ldy*ldz*ndat)= In input: The complex array to be transformed. In output: The FFT results.
SOURCE
2132 subroutine fftw3_many_dft_ip(nx,ny,nz,ldx,ldy,ldz,ndat,isign,finout,fftw_flags) 2133 2134 !Arguments ------------------------------------ 2135 !scalars 2136 integer,intent(in) :: nx,ny,nz,ldx,ldy,ldz,ndat,isign 2137 integer,optional,intent(in) :: fftw_flags 2138 !arrays 2139 real(dp),intent(inout) :: finout(2*ldx*ldy*ldz*ndat) 2140 2141 #ifdef HAVE_FFTW3 2142 !Local variables------------------------------- 2143 !scalars 2144 integer,parameter :: rank3=3,nt_all=-1 2145 integer :: my_flags,dist,stride 2146 integer(KIND_FFTW_PLAN) :: my_plan 2147 !arrays 2148 integer :: embed(rank3),n(rank3) 2149 2150 ! ************************************************************************* 2151 2152 my_flags=ABI_FFTW_ESTIMATE; if (PRESENT(fftw_flags)) my_flags= fftw_flags 2153 2154 stride = 1 2155 dist = ldx*ldy*ldz 2156 embed = [ldx, ldy, ldz] 2157 n = [nx, ny, nz] 2158 2159 my_plan = fftw3_plan_many_dft(rank3, n, ndat, finout, embed, stride, dist, finout, embed, stride, dist, isign, my_flags, nt_all) 2160 2161 ! Now perform the 3D FFT via FFTW. 2162 call dfftw_execute_dft(my_plan, finout, finout) 2163 call fftw3_destroy_plan(my_plan) 2164 2165 ! -1, FFTW returns not normalized FTs 2166 if (isign == ABI_FFTW_FORWARD) then 2167 call ZDSCAL(ldx*ldy*ldz*ndat, one/(nx*ny*nz), finout, 1) 2168 end if 2169 2170 #else 2171 ABI_ERROR("FFTW3 support not activated") 2172 ABI_UNUSED((/nx,ny,nz,ldx,ldy,ldz,ndat,isign/)) 2173 if (PRESENT(fftw_flags)) then 2174 ABI_UNUSED(fftw_flags) 2175 end if 2176 ABI_UNUSED(finout(1)) 2177 #endif 2178 2179 end subroutine fftw3_many_dft_ip
m_fftw3/fftw3_many_dft_op [ Functions ]
[ Top ] [ m_fftw3 ] [ Functions ]
NAME
fftw3_many_dft_op
FUNCTION
Driver routine for many out-of-place 3D complex-to-complex FFTs of lengths nx, ny, nz.
INPUTS
nx,ny,nz=Number of points along the three directions. ldx,ldy,ldz=Physical dimension of the fin and fout arrays (to avoid cache conflicts). ndat=Number of FFTs to be done. fin(2*ldx*ldy*ldz*ndat)=The complex array to be transformed. isign=sign of Fourier transform exponent: current convention uses +1 for transforming from G to r, -1 for transforming from r to G. [fftw_flags]=Flags used to create the plan. They can be combined with the "+" operator. Defaults to ABI_FFTW_ESTIMATE.
OUTPUT
fout(2,ldx*ldy*ldz*ndat)=The Fourier transform of fin.
SOURCE
2053 subroutine fftw3_many_dft_op(nx,ny,nz,ldx,ldy,ldz,ndat,isign,fin,fout,fftw_flags) 2054 2055 !Arguments ------------------------------------ 2056 !scalars 2057 integer,intent(in) :: nx,ny,nz,ldx,ldy,ldz,ndat,isign 2058 integer,optional,intent(in) :: fftw_flags 2059 !arrays 2060 real(dp),intent(in) :: fin(2*ldx*ldy*ldz*ndat) 2061 real(dp),intent(out) :: fout(2*ldx*ldy*ldz*ndat) 2062 2063 #ifdef HAVE_FFTW3 2064 !Local variables------------------------------- 2065 !scalars 2066 integer,parameter :: rank3=3,nt_all=-1 2067 integer :: my_flags,dist,stride 2068 integer(KIND_FFTW_PLAN) :: my_plan 2069 !arrays 2070 integer :: embed(rank3),n(rank3) 2071 2072 ! ************************************************************************* 2073 2074 my_flags=ABI_FFTW_ESTIMATE; if (PRESENT(fftw_flags)) my_flags= fftw_flags 2075 2076 stride = 1 2077 dist = ldx*ldy*ldz 2078 embed = [ldx, ldy, ldz] 2079 n = [nx, ny, nz] 2080 2081 my_plan = fftw3_plan_many_dft(rank3, n, ndat, fin, embed, stride, dist, fout, embed, stride, dist, isign, my_flags, nt_all) 2082 2083 ! Now perform the 3D FFT via FFTW. 2084 call dfftw_execute_dft(my_plan, fin, fout) 2085 2086 call fftw3_destroy_plan(my_plan) 2087 2088 ! -1, FFTW returns not normalized FTs 2089 if (isign == ABI_FFTW_FORWARD) then 2090 call ZDSCAL(ldx*ldy*ldz*ndat, one/(nx*ny*nz), fout, 1) 2091 end if 2092 2093 #else 2094 ABI_ERROR("FFTW3 support not activated") 2095 ABI_UNUSED((/nx,ny,nz,ldx,ldy,ldz,ndat,isign/)) 2096 if (PRESENT(fftw_flags)) then 2097 ABI_UNUSED(fftw_flags) 2098 end if 2099 ABI_UNUSED(fin(1)) 2100 ABI_UNUSED(fout(1)) 2101 #endif 2102 2103 end subroutine fftw3_many_dft_op
m_fftw3/fftw3_mpiback [ Functions ]
[ Top ] [ m_fftw3 ] [ Functions ]
NAME
fftw3_mpiback
FUNCTION
CALCULATES THE DISCRETE FOURIER TRANSFORM in parallel using MPI/OpenMP ZR(I1,I2,I3)= \sum_(j1,j2,j3) EXP(isign*i*2*pi*(j1*i1/n1+j2*i2/n2+j3*i3/n3)) ZF(j1,j3,j2) Adopt standard convention that isign=1 for backward transform INPUTS: option= 1 if call from fourwf, 2 if call from other routine cplex=1 for real --> complex, 2 for complex --> complex ZF: input array in G-space (note the switch of i2 and i3) real(F(i1,i3,i2,idat))=ZF(1,i1,i3,i2,idat) imag(F(i1,i3,i2,idat))=ZF(2,i1,i3,i2,idat) i1=1,n1 , i2=1,n2 , i3=1,n3 , idat=1,ndat OUTPUTS: ZR: output array in R space. ZR(1,i1,i2,i3,idat)=real(R(i1,i2,i3,idat)) ZR(2,i1,i2,i3,idat)=imag(R(i1,i2,i3,idat)) i1=1,n1 , i2=1,n2 , i3=1,n3 , idat=1,ndat nproc_fft: number of processors used as returned by MPI_COMM_SIZE me_fft: [0:nproc_fft-1] number of processor as returned by MPI_COMM_RANK n1,n2,n3: logical dimension of the transform. As transform lengths most products of the prime factors 2,3,5 are allowed. The detailed table with allowed transform lengths can be found in subroutine CTRIG nd1,nd2,nd3: Dimension of ZF and ZR nd2proc=((nd2-1)/nproc_fft)+1 maximal number of 2nd dim slices nd3proc=((nd3-1)/nproc_fft)+1 maximal number of 3rd dim slices NOTES: The maximum number of processors that can reasonably be used is max(n2,n3) It is very important to find the optimal value of NCACHE. NCACHE determines the size of the work array ZW, that has to fit into cache. It has therefore to be chosen to equal roughly half the size of the physical cache in units of real*8 numbers. The optimal value of ncache can easily be determined by numerical experimentation. A too large value of ncache leads to a dramatic and sudden decrease of performance, a too small value to a to a slow and less dramatic decrease of performance. If NCACHE is set to a value so small, that not even a single one dimensional transform can be done in the workarray zw, the program stops with an error message.
SOURCE
4664 subroutine fftw3_mpiback(cplex,ndat,n1,n2,n3,nd1,nd2,nd3,nd1eff,nd2proc,nd3proc,option,zf,zr,comm_fft) 4665 4666 !Arguments ------------------------------------ 4667 ! real space input 4668 integer,intent(in) :: cplex,ndat,n1,n2,n3,nd1,nd2,nd3,nd1eff,nd2proc,nd3proc,option,comm_fft 4669 real(dp),intent(in) :: zf(2,nd1,nd3,nd2proc,ndat) 4670 real(dp),intent(out) :: zr(2,nd1eff,nd2,nd3proc,ndat) 4671 4672 !Local variables------------------------------- 4673 !scalaras 4674 #ifdef HAVE_FFTW3 4675 integer :: j,i1,idat,ierr,includelast,j2,j2st,j3,jeff,jp2st,lzt,nthreads 4676 integer :: ma,mb,n1dfft,n1eff,n2eff,n1zt,ncache,nnd3,nproc_fft,me_fft,lot1,lot2,lot3 4677 integer(KIND_FFTW_PLAN) :: bw_plan1_lot,bw_plan1_rest 4678 integer(KIND_FFTW_PLAN) :: bw_plan2_lot,bw_plan2_rest 4679 integer(KIND_FFTW_PLAN) :: bw_plan3_lot,bw_plan3_rest 4680 character(len=500) :: msg 4681 !arrays 4682 real(dp), allocatable :: zmpi1(:,:,:,:),zmpi2(:,:,:,:) ! work arrays for MPI 4683 real(dp),allocatable :: zw(:,:),zt(:,:,:) ! cache work array and array for transpositions 4684 4685 ! ************************************************************************* 4686 4687 nproc_fft = xmpi_comm_size(comm_fft); me_fft = xmpi_comm_rank(comm_fft) 4688 4689 ! find cache size that gives optimal performance on machine 4690 ncache=2*max(n1,n2,n3,1024) 4691 4692 if (ncache/(2*max(n1,n2,n3))<1) then 4693 write(msg,'(5a)') & 4694 & 'ncache has to be enlarged to be able to hold at',ch10, & 4695 & 'least one 1-d FFT of each size even though this will',ch10,& 4696 & 'reduce the performance for shorter transform lengths' 4697 ABI_ERROR(msg) 4698 end if 4699 4700 ! check input 4701 if (nd1<n1 .or. nd2<n2 .or. nd3<n3) then 4702 ABI_ERROR("nd1<n1 .or. nd2<n2 .or. nd3<n3") 4703 end if 4704 4705 ! Effective n1 and n2 (complex-to-complex or real-to-complex) 4706 n1eff=n1; n2eff=n2; n1zt=n1 4707 if (cplex==1) then 4708 n1eff=(n1+1)/2; n2eff=n2/2+1 ; n1zt=2*(n1/2+1) 4709 end if 4710 4711 lzt=n2eff 4712 if (mod(n2eff,2) == 0) lzt=lzt+1 4713 if (mod(n2eff,4) == 0) lzt=lzt+1 4714 4715 ! maximal number of big box 3rd dim slices for all procs 4716 nnd3=nd3proc*nproc_fft 4717 4718 ABI_MALLOC(zw,(2,ncache/2)) 4719 ABI_MALLOC(zt,(2,lzt,n1zt)) 4720 ABI_MALLOC(zmpi2,(2,n1,nd2proc,nnd3)) 4721 if (nproc_fft>1) then 4722 ABI_MALLOC(zmpi1,(2,n1,nd2proc,nnd3)) 4723 end if 4724 4725 !DEBUG 4726 ! write(std_out,'(a,3i4)' )'back,zf n1,n2,n3',n1,n2,n3 4727 ! write(std_out,'(a,3i4)' )'nd1,nd2,nd3proc',nd1,nd2,nd3proc 4728 ! write(std_out,'(a,3i4)' )'m1,m2,m3',m1,m2,m3 4729 ! write(std_out,'(a,3i4)' )'max1,max2,max3',max1,max2,max3 4730 ! write(std_out,'(a,3i4)' )'md1,md2proc,md3',md1,md2proc,md3 4731 ! write(std_out,'(a,3i4)' )'n1eff,m2eff,m1zt',n1eff,m2eff,m1zt 4732 !ENDDEBUG 4733 4734 ! Create plans. 4735 ! The prototype for sfftw_plan_many_dft is: 4736 ! sfftw_plan_many_dft(rank, n, howmany, 4737 ! fin, iembed, istride, idist, 4738 ! fout, oembed, ostride, odist, isign, my_flags) 4739 4740 lot3=ncache/(2*n3) 4741 lot1=ncache/(2*n1) 4742 lot2=ncache/(2*n2) 4743 4744 nthreads = xomp_get_num_threads(open_parallel=.TRUE.) 4745 !nthreads = 1 4746 4747 bw_plan3_lot = dplan_many_dft_2D(1, [n3], lot3, & 4748 & zw, [ncache/2], lot3, 1, & 4749 & zw, [ncache/2], lot3, 1, ABI_FFTW_BACKWARD, ABI_FFTW_ESTIMATE, nthreads) 4750 4751 if (mod(n1, lot3) /= 0) then 4752 bw_plan3_rest = dplan_many_dft_2D(1, [n3], mod(n1, lot3), & 4753 & zw, [ncache/2], lot3, 1, & 4754 & zw, [ncache/2], lot3, 1, ABI_FFTW_BACKWARD, ABI_FFTW_ESTIMATE, nthreads) 4755 end if 4756 4757 bw_plan1_lot = dplan_many_dft_2D(1, [n1], lot1, & 4758 & zw, [ncache/2], lot1, 1, & 4759 & zt, [lzt, n1zt], lzt, 1, ABI_FFTW_BACKWARD, ABI_FFTW_ESTIMATE, nthreads) 4760 4761 if (mod(n2eff, lot1) /= 0) then 4762 bw_plan1_rest = dplan_many_dft_2D(1, [n1], mod(n2eff, lot1), & 4763 & zw, [ncache/2], lot1, 1, & 4764 & zt, [lzt, n1zt], lzt, 1, ABI_FFTW_BACKWARD, ABI_FFTW_ESTIMATE, nthreads) 4765 end if 4766 4767 bw_plan2_lot = dplan_many_dft_2D(1, [n2], lot2, & 4768 & zw, [ncache/2], lot2, 1, & 4769 & zr, [nd1eff,nd2,nd3proc,ndat], nd1eff, 1, ABI_FFTW_BACKWARD, ABI_FFTW_ESTIMATE, nthreads) 4770 4771 if (mod(n1eff, lot2) /= 0) then 4772 bw_plan2_rest = dplan_many_dft_2D(1, [n2], mod(n1eff,lot2), & 4773 & zw, [ncache/2], lot2, 1, & 4774 & zr, [nd1eff,nd2,nd3proc,ndat], nd1eff, 1, ABI_FFTW_BACKWARD, ABI_FFTW_ESTIMATE, nthreads) 4775 end if 4776 4777 do idat=1,ndat 4778 ! transform along z axis 4779 ! input: I1,I3,J2,(Jp2) 4780 4781 do j2=1,nd2proc 4782 if (me_fft*nd2proc+j2 <= n2eff) then 4783 4784 do i1=1,n1,lot3 4785 ma=i1 4786 mb=min(i1+(lot3-1),n1) 4787 n1dfft=mb-ma+1 4788 4789 ! input: G1,G3,G2,(Gp2) 4790 ! output: G1,R3,G2,(Gp2) 4791 call fill(nd1,nd3,lot3,n1dfft,n3,zf(1,i1,1,j2,idat),zw) 4792 4793 if (n1dfft == lot3) then 4794 call dfftw_execute_dft(bw_plan3_lot, zw, zw) 4795 else 4796 call dfftw_execute_dft(bw_plan3_rest, zw, zw) 4797 end if 4798 4799 ! input: G1,R3,G2,(Gp2) 4800 ! output: G1,G2,R3,(Gp2) 4801 call scramble(i1,j2,lot3,n1dfft,n1,n3,nd2proc,nd3,zw,zmpi2) 4802 end do 4803 end if 4804 end do 4805 4806 ! Interprocessor data transposition 4807 ! input: G1,G2,R3,Rp3,(Gp2) 4808 ! output: G1,G2,G3,Gp2,(Rp3) 4809 if (nproc_fft>1) then 4810 call xmpi_alltoall(zmpi2,2*n1*nd2proc*nd3proc, & 4811 & zmpi1,2*n1*nd2proc*nd3proc,comm_fft,ierr) 4812 end if 4813 4814 do j3=1,nd3proc 4815 if (me_fft*nd3proc+j3 <= n3) then 4816 Jp2st=1; J2st=1 4817 4818 ! transform along x axis 4819 do j=1,n2eff,lot1 4820 ma=j 4821 mb=min(j+(lot1-1),n2eff) 4822 n1dfft=mb-ma+1 4823 4824 ! input: G1,G2,R3,Gp2,(Rp3) 4825 ! output: G2,G1,R3,Jp2,(Rp3) 4826 if (nproc_fft == 1) then 4827 call mpiswitch(j3,n1dfft,Jp2st,J2st,lot1,n1,nd2proc,nd3proc,nproc_fft,option,zmpi2,zw) 4828 else 4829 call mpiswitch(j3,n1dfft,Jp2st,J2st,lot1,n1,nd2proc,nd3proc,nproc_fft,option,zmpi1,zw) 4830 end if 4831 4832 ! input: G2,G1,R3,(Rp3) 4833 ! output: G2,R1,R3,(Rp3) 4834 if (n1dfft == lot1) then 4835 call dfftw_execute_dft(bw_plan1_lot, zw, zt(1,j,1)) 4836 else 4837 call dfftw_execute_dft(bw_plan1_rest, zw, zt(1,j,1)) 4838 end if 4839 4840 end do 4841 4842 ! transform along y axis 4843 do j=1,n1eff,lot2 4844 ma=j 4845 mb=min(j+(lot2-1),n1eff) 4846 n1dfft=mb-ma+1 4847 includelast=1 4848 if (cplex==1) then 4849 jeff=2*j-1 4850 includelast=1 4851 if (mb==n1eff .and. n1eff*2/=n1) includelast=0 4852 end if 4853 4854 ! input: G2,R1,R3,(Rp3) 4855 ! output: R1,G2,R3,(Rp3) 4856 if (cplex==2) then 4857 call switch(n1dfft,n2,lot2,n1,lzt,zt(1,1,j),zw) 4858 else 4859 call switchreal(includelast,n1dfft,n2,n2eff,lot2,n1zt,lzt,zt(1,1,jeff),zw) 4860 end if 4861 4862 if (n1dfft == lot2) then 4863 call dfftw_execute_dft(bw_plan2_lot, zw, zr(1,j,1,j3,idat)) 4864 else 4865 call dfftw_execute_dft(bw_plan2_rest, zw, zr(1,j,1,j3,idat)) 4866 end if 4867 end do 4868 ! output: R1,R2,R3,(Rp3) 4869 4870 end if 4871 end do 4872 end do ! idat 4873 4874 call dfftw_destroy_plan(bw_plan3_lot) 4875 if (mod(n1, lot3) /= 0) then 4876 call dfftw_destroy_plan(bw_plan3_rest) 4877 end if 4878 4879 call dfftw_destroy_plan(bw_plan1_lot) 4880 if (mod(n2eff, lot1) /= 0) then 4881 call dfftw_destroy_plan(bw_plan1_rest) 4882 end if 4883 4884 call dfftw_destroy_plan(bw_plan2_lot) 4885 if (mod(n1eff, lot2) /= 0) then 4886 call dfftw_destroy_plan(bw_plan2_rest) 4887 end if 4888 4889 ABI_FREE(zmpi2) 4890 ABI_FREE(zw) 4891 ABI_FREE(zt) 4892 if (nproc_fft>1) then 4893 ABI_FREE(zmpi1) 4894 end if 4895 4896 #else 4897 ABI_ERROR("FFTW3 support not activated") 4898 ABI_UNUSED((/cplex,ndat,n1,n2,n3,nd1,nd2,nd1eff,nd2proc,nd3proc,option,comm_fft/)) 4899 ABI_UNUSED((/zf(1,1,1,1,1),zr(1,1,1,1,1)/)) 4900 #endif 4901 4902 end subroutine fftw3_mpiback
m_fftw3/fftw3_mpiback_manywf [ Functions ]
[ Top ] [ m_fftw3 ] [ Functions ]
NAME
fftw3_mpiback_manywf
FUNCTION
Does multiple 3-dim backward FFTs from Fourier into real space Adopt standard convention that isign=1 for backward transform CALCULATES THE DISCRETE FOURIER TRANSFORM ZF(I1,I2,I3)= S_(j1,j2,j3) EXP(isign*i*2*pi*(j1*i1/n1+j2*i2/n2+j3*i3/n3)) ZF(j1,j3,j2) in parallel using MPI/OpenMP. INPUTS: cplexwf=1 if wavefunction is real, 2 if complex ndat=Number of wavefunctions to transform. n1,n2,n3: logical dimension of the transform. As transform lengths most products of the prime factors 2,3,5 are allowed. The detailed table with allowed transform lengths can be found in subroutine CTRIG nd1,nd2,nd3: Leading Dimension of ZR nd3proc=((nd3-1)/nproc_fft)+1 maximal number of big box 3rd dim slices for one proc max1 is positive or zero; m1 >=max1+1 i1= 1... max1+1 corresponds to positive and zero wavevectors 0 ... max1 then, if m1 > max1+1, one has min1=max1-m1+1 and i1= max1+2 ... m1 corresponds to negative wavevectors min1 ... -1 max2 and max3 have a similar definition of range m1,m2,m3=Size of the box enclosing the G-sphere. md1,md2,md3: Dimension of ZF given on the **small** FFT box. md2proc=((md2-1)/nproc_fft)+1 maximal number of small box 2nd dim slices for one proc nproc_fft: number of processors used as returned by MPI_COMM_SIZE comm_fft=MPI communicator for the FFT. ZF: input array (note the switch of i2 and i3) real(F(i1,i3,i2,idat))=ZF(1,i1,i3,i2,idat) imag(F(i1,i3,i2,idat))=ZF(2,i1,i3,i2,idat) OUTPUTS ZR: output array ZR(1,i1,i2,i3,idat)=real(R(i1,i2,i3,idat)) ZR(2,i1,i2,i3,idat)=imag(R(i1,i2,i3,idat)) i1=1,n1 , i2=1,n2 , i3=1,n3 , idat=1,ndat
NOTES
The maximum number of processors that can reasonably be used is max(n2/2,n3/2) It is very important to find the optimal value of NCACHE. NCACHE determines the size of the work array ZW, that has to fit into cache. It has therefore to be chosen to equal roughly half the size of the physical cache in units of real*8 numbers. The optimal value of ncache can easily be determined by numerical experimentation. A too large value of ncache leads to a dramatic and sudden decrease of performance, a too small value to a to a slow and less dramatic decrease of performance. If NCACHE is set to a value so small, that not even a single one dimensional transform can be done in the workarray zw, the program stops with an error message.
SOURCE
6116 subroutine fftw3_mpiback_manywf(cplexwf,ndat,n1,n2,n3,nd1,nd2,nd3proc,& 6117 & max1,max2,max3,m1,m2,m3,md1,md2proc,md3,zf,zr,comm_fft) 6118 6119 !Arguments ------------------------------------ 6120 integer,intent(in) :: cplexwf,ndat,n1,n2,n3,nd1,nd2,nd3proc 6121 integer,intent(in) :: max1,max2,max3,m1,m2,m3,md1,md2proc,md3,comm_fft 6122 real(dp),intent(in) :: zf(2,md1,md3,md2proc,ndat) 6123 real(dp),intent(out) :: zr(2,nd1,nd2,nd3proc,ndat) 6124 6125 #ifdef HAVE_FFTW3 6126 !Local variables------------------------------- 6127 integer,parameter :: nt1=1 6128 integer :: j,i1,i2,idat,ierr,includelast,nthreads 6129 integer :: ioption,j2,j3,j2st,jp2st,jeff,lzt,m1zt,ma,mb,n1dfft,nnd3 6130 integer :: lot1,lot2,lot3 6131 integer :: m2eff,ncache,n1eff,n1half,nproc_fft,me_fft 6132 integer(KIND_FFTW_PLAN) :: bw_plan1_lot,bw_plan1_rest 6133 integer(KIND_FFTW_PLAN) :: bw_plan2_lot,bw_plan2_rest 6134 integer(KIND_FFTW_PLAN) :: bw_plan3_lot,bw_plan3_rest 6135 !type(C_PTR) :: zw_cptr,zt_cptr 6136 character(len=500) :: msg 6137 !arrays 6138 integer :: requests(ndat) 6139 real(dp) ABI_ASYNC, allocatable :: zmpi1(:,:,:,:,:),zmpi2(:,:,:,:,:) ! work arrays for MPI 6140 real(dp),allocatable :: zw(:,:),zt(:,:,:) ! cache work array and array for transpositions 6141 !real(dp),ABI_CONTIGUOUS pointer :: zw(:,:),zt(:,:,:) 6142 ! FFT work arrays 6143 real(dp) :: tsec(2) 6144 6145 ! ************************************************************************* 6146 6147 !call wrtout(std_out,"mpiback with non-blocking IALLTOALL + FFTW3") 6148 6149 6150 ! FIXME must provide a default value but which one? 6151 ! ioption = 0 6152 ioption = 1 6153 !if (paral_kgb==1) ioption=1 6154 6155 nproc_fft = xmpi_comm_size(comm_fft); me_fft = xmpi_comm_rank(comm_fft) 6156 6157 ! Find cache size that gives optimal performance on machine 6158 ncache=2*max(n1,n2,n3,1024) 6159 if (ncache/(2*max(n1,n2,n3))<1) then 6160 write(msg,"(5a)") & 6161 & 'ncache has to be enlarged to be able to hold at',ch10, & 6162 & 'least one 1-d FFT of each size even though this will',ch10,& 6163 & 'reduce the performance for shorter transform lengths' 6164 ABI_ERROR(msg) 6165 end if 6166 6167 ! Effective m1 and m2 (complex-to-complex or real-to-complex) 6168 n1eff=n1; m2eff=m2; m1zt=n1 6169 if (cplexwf==1) then 6170 n1eff=(n1+1)/2; m2eff=m2/2+1; m1zt=2*(n1/2+1) 6171 end if 6172 6173 lzt=m2eff 6174 if (mod(m2eff,2)==0) lzt=lzt+1 6175 if (mod(m2eff,4)==0) lzt=lzt+1 6176 6177 ! maximal number of big box 3rd dim slices for all procs 6178 nnd3=nd3proc*nproc_fft 6179 6180 ! Allocate cache work array and work arrays for MPI transpositions. 6181 ABI_MALLOC(zw,(2,ncache/2)) 6182 ABI_MALLOC(zt,(2,lzt,m1zt)) 6183 6184 !call fftw3_alloc_real([2,ncache/2],zw_cptr,zw) 6185 !call fftw3_alloc_real([2,lzt,m1zt],zt_cptr,zt) 6186 6187 ABI_MALLOC(zmpi2,(2,md1,md2proc,nnd3,ndat)) 6188 if (nproc_fft>1) then 6189 ABI_MALLOC(zmpi1,(2,md1,md2proc,nnd3,ndat)) 6190 end if 6191 6192 ! Create plans. 6193 ! The prototype for sfftw_plan_many_dft is: 6194 ! sfftw_plan_many_dft(rank, n, howmany, 6195 ! fin, iembed, istride, idist, 6196 ! fout, oembed, ostride, odist, isign, my_flags) 6197 6198 lot3=ncache/(2*n3) 6199 lot1=ncache/(2*n1) 6200 lot2=ncache/(2*n2) 6201 6202 nthreads = xomp_get_num_threads(open_parallel=.TRUE.) 6203 !nthreads = 1 6204 6205 bw_plan3_lot = dplan_many_dft_2D(1, [n3], lot3, & 6206 & zw, [ncache/2], lot3, 1, & 6207 & zw, [ncache/2], lot3, 1, ABI_FFTW_BACKWARD, ABI_FFTW_ESTIMATE, nthreads) 6208 6209 if (mod(m1, lot3) /= 0) then 6210 bw_plan3_rest = dplan_many_dft_2D(1, [n3], mod(m1, lot3), & 6211 & zw, [ncache/2], lot3, 1, & 6212 & zw, [ncache/2], lot3, 1, ABI_FFTW_BACKWARD, ABI_FFTW_ESTIMATE, nthreads) 6213 end if 6214 6215 bw_plan1_lot = dplan_many_dft_2D(1, [n1], lot1, & 6216 & zw, [ncache/2], lot1, 1, & 6217 & zt, [lzt, m1zt], lzt, 1, ABI_FFTW_BACKWARD, ABI_FFTW_ESTIMATE, nthreads) 6218 6219 if (mod(m2eff, lot1) /= 0) then 6220 bw_plan1_rest = dplan_many_dft_2D(1, [n1], mod(m2eff, lot1), & 6221 & zw, [ncache/2], lot1, 1, & 6222 & zt, [lzt, m1zt], lzt, 1, ABI_FFTW_BACKWARD, ABI_FFTW_ESTIMATE, nthreads) 6223 end if 6224 6225 bw_plan2_lot = dplan_many_dft_2D(1, [n2], lot2, & 6226 & zw, [ncache/2], lot2, 1, & 6227 & zr, [nd1,nd2,nd3proc,ndat], nd1, 1, ABI_FFTW_BACKWARD, ABI_FFTW_ESTIMATE, nthreads) 6228 6229 if (mod(n1eff, lot2) /= 0) then 6230 bw_plan2_rest = dplan_many_dft_2D(1, [n2], mod(n1eff,lot2), & 6231 & zw, [ncache/2], lot2, 1, & 6232 & zr, [nd1,nd2,nd3proc,ndat], nd1, 1, ABI_FFTW_BACKWARD, ABI_FFTW_ESTIMATE, nthreads) 6233 end if 6234 6235 do idat=1,ndat 6236 ! transform along z axis 6237 ! input: G1,G3,G2,(Gp2) 6238 6239 ! Loop over the y planes treated by this node and trasform n1ddft G_z lines. 6240 do j2=1,md2proc 6241 ! if (me_fft*md2proc+j2<=m2eff) then !a faire plus tard 6242 do i1=1,m1,lot3 6243 ma=i1 6244 mb=min(i1+(lot3-1),m1) 6245 n1dfft=mb-ma+1 6246 6247 ! zero-pad n1dfft G_z lines 6248 ! input: G1,G3,G2,(Gp2) 6249 ! output: G1,R3,G2,(Gp2) 6250 call fill_cent(md1,md3,lot3,n1dfft,max3,m3,n3,zf(1,i1,1,j2,idat),zw) 6251 6252 ! Transform along z. 6253 if (n1dfft == lot3) then 6254 call dfftw_execute_dft(bw_plan3_lot, zw, zw) 6255 else 6256 call dfftw_execute_dft(bw_plan3_rest, zw, zw) 6257 end if 6258 6259 ! Local rotation. 6260 ! input: G1,R3,G2,(Gp2) 6261 ! output: G1,G2,R3,(Gp2) 6262 call scramble(i1,j2,lot3,n1dfft,md1,n3,md2proc,nnd3,zw,zmpi2(:,:,:,:,idat)) 6263 end do 6264 end do ! j2 6265 6266 ! Interprocessor data transposition 6267 ! input: G1,G2,R3,Rp3,(Gp2) 6268 ! output: G1,G2,R3,Gp2,(Rp3) 6269 if (nproc_fft>1) then 6270 call timab(543,1,tsec) 6271 call xmpi_ialltoall(zmpi2(:,:,:,:,idat),2*md1*md2proc*nd3proc, & 6272 & zmpi1(:,:,:,:,idat),2*md1*md2proc*nd3proc,comm_fft,requests(idat)) 6273 call timab(543,2,tsec) 6274 end if 6275 end do 6276 6277 do idat=1,ndat 6278 if (nproc_fft>1) call xmpi_wait(requests(idat),ierr) 6279 ! Loop over the z treated by this node. 6280 do j3=1,nd3proc 6281 if (me_fft*nd3proc+j3 <= n3) then 6282 Jp2st=1; J2st=1 6283 6284 ! Loop over G_y in the small box. 6285 do j=1,m2eff,lot1 6286 ma=j 6287 mb=min(j+(lot1-1),m2eff) 6288 n1dfft=mb-ma+1 6289 6290 ! Zero-pad input. 6291 ! input: G1,G2,R3,JG2,(Rp3) 6292 ! output: G2,G1,R3,JG2,(Rp3) 6293 if (nproc_fft==1) then 6294 call mpiswitch_cent(j3,n1dfft,Jp2st,J2st,lot1,max1,md1,m1,n1,& 6295 & md2proc,nd3proc,nproc_fft,ioption,zmpi2(:,:,:,:,idat),zw,max2,m2,n2) 6296 else 6297 call mpiswitch_cent(j3,n1dfft,Jp2st,J2st,lot1,max1,md1,m1,n1,& 6298 & md2proc,nd3proc,nproc_fft,ioption,zmpi1(:,:,:,:,idat),zw,max2,m2,n2) 6299 end if 6300 6301 ! Transform along x 6302 ! input: G2,G1,R3,(Rp3) 6303 ! output: G2,R1,R3,(Rp3) 6304 if (n1dfft == lot1) then 6305 call dfftw_execute_dft(bw_plan1_lot, zw, zt(1,j,1)) 6306 else 6307 call dfftw_execute_dft(bw_plan1_rest, zw, zt(1,j,1)) 6308 end if 6309 6310 end do ! j 6311 6312 ! Transform along y axis (take into account c2c or c2r case). 6313 ! Must loop over the full box. 6314 do j=1,n1eff,lot2 6315 ma=j 6316 mb=min(j+(lot2-1),n1eff) 6317 n1dfft=mb-ma+1 6318 includelast=1 6319 6320 if (cplexwf==1) then 6321 jeff=2*j-1 6322 if (mb==n1eff .and. n1eff*2/=n1) includelast=0 6323 end if 6324 6325 ! Zero-pad the input. 6326 ! input: G2,R1,R3,(Rp3) 6327 ! output: R1,G2,R3,(Rp3) 6328 if (cplexwf==2) then 6329 call switch_cent(n1dfft,max2,m2,n2,lot2,n1,lzt,zt(1,1,j),zw) 6330 else 6331 call switchreal_cent(includelast,n1dfft,max2,n2,lot2,m1zt,lzt,zt(1,1,jeff),zw) 6332 end if 6333 6334 ! input: R1,G2,R3,(Rp3) 6335 ! output: R1,R2,R3,(Rp3) 6336 if (n1dfft == lot2) then 6337 call dfftw_execute_dft(bw_plan2_lot, zw, zr(1,j,1,j3,idat)) 6338 else 6339 call dfftw_execute_dft(bw_plan2_rest, zw, zr(1,j,1,j3,idat)) 6340 end if 6341 6342 end do 6343 6344 ! Treat real wavefunctions. 6345 if (cplexwf==1) then 6346 n1half=n1/2 6347 ! If odd 6348 if (n1half*2/=n1) then 6349 do i2=1,n2 6350 zr(1,n1,i2,j3,idat)=zr(1,n1eff,i2,j3,idat) 6351 zr(2,n1,i2,j3,idat)=zero 6352 end do 6353 end if 6354 do i2=1,n2 6355 do i1=n1half,1,-1 6356 zr(1,2*i1-1,i2,j3,idat)=zr(1,i1,i2,j3,idat) 6357 zr(1,2*i1 ,i2,j3,idat)=zr(2,i1,i2,j3,idat) 6358 zr(2,2*i1-1,i2,j3,idat)=zero 6359 zr(2,2*i1 ,i2,j3,idat)=zero 6360 end do 6361 end do 6362 end if 6363 6364 end if 6365 6366 end do ! j3 6367 end do ! idat 6368 6369 call dfftw_destroy_plan(bw_plan3_lot) 6370 if (mod(m1, lot3) /= 0) then 6371 call dfftw_destroy_plan(bw_plan3_rest) 6372 end if 6373 6374 call dfftw_destroy_plan(bw_plan1_lot) 6375 if (mod(m2eff, lot1) /= 0) then 6376 call dfftw_destroy_plan(bw_plan1_rest) 6377 end if 6378 6379 call dfftw_destroy_plan(bw_plan2_lot) 6380 if (mod(n1eff, lot2) /= 0) then 6381 call dfftw_destroy_plan(bw_plan2_rest) 6382 end if 6383 6384 ABI_FREE(zmpi2) 6385 ABI_FREE(zw) 6386 ABI_FREE(zt) 6387 if (nproc_fft>1) then 6388 ABI_FREE(zmpi1) 6389 end if 6390 6391 #else 6392 ABI_ERROR("FFTW3 support not activated") 6393 ABI_UNUSED((/cplexwf,ndat,n1,n2,n3,nd1,nd2,nd3proc/)) 6394 ABI_UNUSED((/ max1,max2,max3,m1,m2,m3,md1,md2proc,md3,comm_fft/)) 6395 ABI_UNUSED((/zf(1,1,1,1,1),zr(1,1,1,1,1)/)) 6396 #endif 6397 6398 end subroutine fftw3_mpiback_manywf
m_fftw3/fftw3_mpiback_wf [ Functions ]
[ Top ] [ m_fftw3 ] [ Functions ]
NAME
fftw3_mpiback_wf
FUNCTION
Does multiple 3-dim backward FFTs from Fourier into real space Adopt standard convention that isign=1 for backward transform CALCULATES THE DISCRETE FOURIER TRANSFORM ZF(I1,I2,I3)= S_(j1,j2,j3) EXP(isign*i*2*pi*(j1*i1/n1+j2*i2/n2+j3*i3/n3)) ZF(j1,j3,j2) in parallel using MPI/OpenMP. INPUTS: cplexwf=1 if wavefunction is real, 2 if complex ndat=Number of wavefunctions to transform. n1,n2,n3: logical dimension of the transform. As transform lengths most products of the prime factors 2,3,5 are allowed. The detailed table with allowed transform lengths can be found in subroutine CTRIG nd1,nd2,nd3: Leading Dimension of ZR nd3proc=((nd3-1)/nproc_fft)+1 maximal number of big box 3rd dim slices for one proc max1 is positive or zero; m1 >=max1+1 i1= 1... max1+1 corresponds to positive and zero wavevectors 0 ... max1 then, if m1 > max1+1, one has min1=max1-m1+1 and i1= max1+2 ... m1 corresponds to negative wavevectors min1 ... -1 max2 and max3 have a similar definition of range m1,m2,m3=Size of the box enclosing the G-sphere. md1,md2,md3: Dimension of ZF given on the **small** FFT box. md2proc=((md2-1)/nproc_fft)+1 maximal number of small box 2nd dim slices for one proc nproc_fft: number of processors used as returned by MPI_COMM_SIZE comm_fft=MPI communicator for the FFT. ZF: input array (note the switch of i2 and i3) real(F(i1,i3,i2,idat))=ZF(1,i1,i3,i2,idat) imag(F(i1,i3,i2,idat))=ZF(2,i1,i3,i2,idat) OUTPUTS ZR: output array ZR(1,i1,i2,i3,idat)=real(R(i1,i2,i3,idat)) ZR(2,i1,i2,i3,idat)=imag(R(i1,i2,i3,idat)) i1=1,n1 , i2=1,n2 , i3=1,n3 , idat=1,ndat
NOTES
The maximum number of processors that can reasonably be used is max(n2/2,n3/2) It is very important to find the optimal value of NCACHE. NCACHE determines the size of the work array ZW, that has to fit into cache. It has therefore to be chosen to equal roughly half the size of the physical cache in units of real*8 numbers. The optimal value of ncache can easily be determined by numerical experimentation. A too large value of ncache leads to a dramatic and sudden decrease of performance, a too small value to a to a slow and less dramatic decrease of performance. If NCACHE is set to a value so small, that not even a single one dimensional transform can be done in the workarray zw, the program stops with an error message.
SOURCE
3970 subroutine fftw3_mpiback_wf(cplexwf,ndat,n1,n2,n3,nd1,nd2,nd3proc,& 3971 & max1,max2,max3,m1,m2,m3,md1,md2proc,md3,zf,zr,comm_fft) 3972 3973 !Arguments ------------------------------------ 3974 integer,intent(in) :: cplexwf,ndat,n1,n2,n3,nd1,nd2,nd3proc 3975 integer,intent(in) :: max1,max2,max3,m1,m2,m3,md1,md2proc,md3,comm_fft 3976 real(dp),intent(in) :: zf(2,md1,md3,md2proc,ndat) 3977 real(dp),intent(out) :: zr(2,nd1,nd2,nd3proc,ndat) 3978 3979 #ifdef HAVE_FFTW3 3980 !Local variables------------------------------- 3981 integer,parameter :: nt1=1 3982 integer :: j,i1,i2,idat,ierr,includelast 3983 integer :: ioption,j2,j3,j2st,jp2st,jeff,lzt,m1zt,ma,mb,n1dfft,nnd3 3984 integer :: lot1,lot2,lot3 3985 integer :: m2eff,ncache,n1eff,n1half,nproc_fft,me_fft,nthreads 3986 integer(KIND_FFTW_PLAN) :: bw_plan1_lot,bw_plan1_rest 3987 integer(KIND_FFTW_PLAN) :: bw_plan2_lot,bw_plan2_rest 3988 integer(KIND_FFTW_PLAN) :: bw_plan3_lot,bw_plan3_rest 3989 !type(C_PTR) :: zw_cptr,zt_cptr 3990 character(len=500) :: msg 3991 !arrays 3992 real(dp),allocatable :: zmpi1(:,:,:,:),zmpi2(:,:,:,:) ! work arrays for MPI 3993 real(dp),allocatable :: zw(:,:),zt(:,:,:) ! cache work array and array for transpositions 3994 !real(dp),ABI_CONTIGUOUS pointer :: zw(:,:),zt(:,:,:) 3995 ! FFT work arrays 3996 real(dp) :: tsec(2) 3997 3998 ! ************************************************************************* 3999 4000 !call wrtout(std_out,"mpiback standard ALLTOALL + FFTW3") 4001 4002 ! FIXME must provide a default value but which one? 4003 ! ioption = 0 4004 ioption = 1 4005 !if (paral_kgb==1) ioption=1 4006 4007 nproc_fft = xmpi_comm_size(comm_fft); me_fft = xmpi_comm_rank(comm_fft) 4008 4009 ! Find cache size that gives optimal performance on machine 4010 ncache=2*max(n1,n2,n3,1024) 4011 if (ncache/(2*max(n1,n2,n3))<1) then 4012 write(msg,"(5a)") & 4013 & 'ncache has to be enlarged to be able to hold at',ch10, & 4014 & 'least one 1-d FFT of each size even though this will',ch10,& 4015 & 'reduce the performance for shorter transform lengths' 4016 ABI_ERROR(msg) 4017 end if 4018 4019 ! Effective m1 and m2 (complex-to-complex or real-to-complex) 4020 n1eff=n1; m2eff=m2; m1zt=n1 4021 if (cplexwf==1) then 4022 n1eff=(n1+1)/2; m2eff=m2/2+1; m1zt=2*(n1/2+1) 4023 end if 4024 4025 lzt=m2eff 4026 if (mod(m2eff,2)==0) lzt=lzt+1 4027 if (mod(m2eff,4)==0) lzt=lzt+1 4028 4029 ! maximal number of big box 3rd dim slices for all procs 4030 nnd3=nd3proc*nproc_fft 4031 4032 ! Allocate cache work array and work arrays for MPI transpositions. 4033 ABI_MALLOC(zw,(2,ncache/2)) 4034 ABI_MALLOC(zt,(2,lzt,m1zt)) 4035 4036 !call fftw3_alloc_real([2,ncache/2],zw_cptr,zw) 4037 !call fftw3_alloc_real([2,lzt,m1zt],zt_cptr,zt) 4038 4039 ABI_MALLOC(zmpi2,(2,md1,md2proc,nnd3)) 4040 if (nproc_fft>1) then 4041 ABI_MALLOC(zmpi1,(2,md1,md2proc,nnd3)) 4042 end if 4043 4044 !DEBUG 4045 ! write(std_out,'(2a,3i4)' )itoa(me_fft),': fftw3_mpiback_wf,zf n1,n2,n3',n1,n2,n3 4046 ! write(std_out,'(2a,3i4)' )itoa(me_fft),': nd1,nd2,nd3proc',nd1,nd2,nd3proc 4047 ! write(std_out,'(2a,3i4)' )itoa(me_fft),': m1,m2,m3',m1,m2,m3 4048 ! write(std_out,'(2a,3i4)' )itoa(me_fft),': max1,max2,max3',max1,max2,max3 4049 ! write(std_out,'(2a,3i4)' )itoa(me_fft),': md1,md2proc,md3',md1,md2proc,md3 4050 ! write(std_out,'(2a,3i4)' )itoa(me_fft),'n1eff,m2eff,m1zt',n1eff,m2eff,m1zt 4051 !ENDDEBUG 4052 4053 ! Create plans. 4054 ! The prototype for sfftw_plan_many_dft is: 4055 ! sfftw_plan_many_dft(rank, n, howmany, 4056 ! fin, iembed, istride, idist, 4057 ! fout, oembed, ostride, odist, isign, my_flags) 4058 4059 lot3=ncache/(2*n3) 4060 lot1=ncache/(2*n1) 4061 lot2=ncache/(2*n2) 4062 4063 nthreads = xomp_get_num_threads(open_parallel=.TRUE.) 4064 !nthreads = 1 4065 4066 bw_plan3_lot = dplan_many_dft_2D(1, [n3], lot3, & 4067 & zw, [ncache/2], lot3, 1, & 4068 & zw, [ncache/2], lot3, 1, ABI_FFTW_BACKWARD, ABI_FFTW_ESTIMATE, nthreads) 4069 4070 if (mod(m1, lot3) /= 0) then 4071 bw_plan3_rest = dplan_many_dft_2D(1, [n3], mod(m1, lot3), & 4072 & zw, [ncache/2], lot3, 1, & 4073 & zw, [ncache/2], lot3, 1, ABI_FFTW_BACKWARD, ABI_FFTW_ESTIMATE, nthreads) 4074 end if 4075 4076 bw_plan1_lot = dplan_many_dft_2D(1, [n1], lot1, & 4077 & zw, [ncache/2], lot1, 1, & 4078 & zt, [lzt, m1zt], lzt, 1, ABI_FFTW_BACKWARD, ABI_FFTW_ESTIMATE, nthreads) 4079 4080 if (mod(m2eff, lot1) /= 0) then 4081 bw_plan1_rest = dplan_many_dft_2D(1, [n1], mod(m2eff, lot1), & 4082 & zw, [ncache/2], lot1, 1, & 4083 & zt, [lzt, m1zt], lzt, 1, ABI_FFTW_BACKWARD, ABI_FFTW_ESTIMATE, nthreads) 4084 end if 4085 4086 bw_plan2_lot = dplan_many_dft_2D(1, [n2], lot2, & 4087 & zw, [ncache/2], lot2, 1, & 4088 & zr, [nd1,nd2,nd3proc,ndat], nd1, 1, ABI_FFTW_BACKWARD, ABI_FFTW_ESTIMATE, nthreads) 4089 4090 if (mod(n1eff, lot2) /= 0) then 4091 bw_plan2_rest = dplan_many_dft_2D(1, [n2], mod(n1eff,lot2), & 4092 & zw, [ncache/2], lot2, 1, & 4093 & zr, [nd1,nd2,nd3proc,ndat], nd1, 1, ABI_FFTW_BACKWARD, ABI_FFTW_ESTIMATE, nthreads) 4094 end if 4095 4096 do idat=1,ndat 4097 ! transform along z axis 4098 ! input: G1,G3,G2,(Gp2) 4099 4100 ! Loop over the y planes treated by this node and trasform n1ddft G_z lines. 4101 do j2=1,md2proc 4102 ! if (me_fft*md2proc+j2<=m2eff) then !a faire plus tard 4103 do i1=1,m1,lot3 4104 ma=i1 4105 mb=min(i1+(lot3-1),m1) 4106 n1dfft=mb-ma+1 4107 4108 ! zero-pad n1dfft G_z lines 4109 ! input: G1,G3,G2,(Gp2) 4110 ! output: G1,R3,G2,(Gp2) 4111 call fill_cent(md1,md3,lot3,n1dfft,max3,m3,n3,zf(1,i1,1,j2,idat),zw) 4112 4113 ! Transform along z. 4114 if (n1dfft == lot3) then 4115 call dfftw_execute_dft(bw_plan3_lot, zw, zw) 4116 else 4117 call dfftw_execute_dft(bw_plan3_rest, zw, zw) 4118 end if 4119 4120 ! Local rotation. 4121 ! input: G1,R3,G2,(Gp2) 4122 ! output: G1,G2,R3,(Gp2) 4123 call scramble(i1,j2,lot3,n1dfft,md1,n3,md2proc,nnd3,zw,zmpi2) 4124 end do 4125 end do ! j2 4126 4127 ! Interprocessor data transposition 4128 ! input: G1,G2,R3,Rp3,(Gp2) 4129 ! output: G1,G2,R3,Gp2,(Rp3) 4130 if (nproc_fft>1) then 4131 call timab(543,1,tsec) 4132 call xmpi_alltoall(zmpi2,2*md1*md2proc*nd3proc, & 4133 & zmpi1,2*md1*md2proc*nd3proc,comm_fft,ierr) 4134 call timab(543,2,tsec) 4135 end if 4136 4137 ! Loop over the z treated by this node. 4138 do j3=1,nd3proc 4139 if (me_fft*nd3proc+j3 <= n3) then 4140 Jp2st=1; J2st=1 4141 4142 ! Loop over G_y in the small box. 4143 do j=1,m2eff,lot1 4144 ma=j 4145 mb=min(j+(lot1-1),m2eff) 4146 n1dfft=mb-ma+1 4147 4148 ! Zero-pad input. 4149 ! input: G1,G2,R3,JG2,(Rp3) 4150 ! output: G2,G1,R3,JG2,(Rp3) 4151 if (nproc_fft==1) then 4152 call mpiswitch_cent(j3,n1dfft,Jp2st,J2st,lot1,max1,md1,m1,n1,& 4153 & md2proc,nd3proc,nproc_fft,ioption,zmpi2,zw,max2,m2,n2) 4154 else 4155 call mpiswitch_cent(j3,n1dfft,Jp2st,J2st,lot1,max1,md1,m1,n1,& 4156 & md2proc,nd3proc,nproc_fft,ioption,zmpi1,zw,max2,m2,n2) 4157 end if 4158 4159 ! Transform along x 4160 ! input: G2,G1,R3,(Rp3) 4161 ! output: G2,R1,R3,(Rp3) 4162 if (n1dfft == lot1) then 4163 call dfftw_execute_dft(bw_plan1_lot, zw, zt(1,j,1)) 4164 else 4165 call dfftw_execute_dft(bw_plan1_rest, zw, zt(1,j,1)) 4166 end if 4167 4168 end do ! j 4169 4170 ! Transform along y axis (take into account c2c or c2r case). 4171 ! Must loop over the full box. 4172 do j=1,n1eff,lot2 4173 ma=j 4174 mb=min(j+(lot2-1),n1eff) 4175 n1dfft=mb-ma+1 4176 includelast=1 4177 4178 if (cplexwf==1) then 4179 jeff=2*j-1 4180 if (mb==n1eff .and. n1eff*2/=n1) includelast=0 4181 end if 4182 4183 ! Zero-pad the input. 4184 ! input: G2,R1,R3,(Rp3) 4185 ! output: R1,G2,R3,(Rp3) 4186 if (cplexwf==2) then 4187 call switch_cent(n1dfft,max2,m2,n2,lot2,n1,lzt,zt(1,1,j),zw) 4188 else 4189 call switchreal_cent(includelast,n1dfft,max2,n2,lot2,m1zt,lzt,zt(1,1,jeff),zw) 4190 end if 4191 4192 ! input: R1,G2,R3,(Rp3) 4193 ! output: R1,R2,R3,(Rp3) 4194 if (n1dfft == lot2) then 4195 call dfftw_execute_dft(bw_plan2_lot, zw, zr(1,j,1,j3,idat)) 4196 else 4197 call dfftw_execute_dft(bw_plan2_rest, zw, zr(1,j,1,j3,idat)) 4198 end if 4199 4200 end do 4201 4202 ! Treat real wavefunctions. 4203 if (cplexwf==1) then 4204 n1half=n1/2 4205 ! If odd 4206 if (n1half*2/=n1) then 4207 do i2=1,n2 4208 zr(1,n1,i2,j3,idat)=zr(1,n1eff,i2,j3,idat) 4209 zr(2,n1,i2,j3,idat)=zero 4210 end do 4211 end if 4212 do i2=1,n2 4213 do i1=n1half,1,-1 4214 zr(1,2*i1-1,i2,j3,idat)=zr(1,i1,i2,j3,idat) 4215 zr(1,2*i1 ,i2,j3,idat)=zr(2,i1,i2,j3,idat) 4216 zr(2,2*i1-1,i2,j3,idat)=zero 4217 zr(2,2*i1 ,i2,j3,idat)=zero 4218 end do 4219 end do 4220 end if 4221 4222 end if 4223 end do ! j3 4224 end do ! idat 4225 4226 call dfftw_destroy_plan(bw_plan3_lot) 4227 if (mod(m1, lot3) /= 0) then 4228 call dfftw_destroy_plan(bw_plan3_rest) 4229 end if 4230 4231 call dfftw_destroy_plan(bw_plan1_lot) 4232 if (mod(m2eff, lot1) /= 0) then 4233 call dfftw_destroy_plan(bw_plan1_rest) 4234 end if 4235 4236 call dfftw_destroy_plan(bw_plan2_lot) 4237 if (mod(n1eff, lot2) /= 0) then 4238 call dfftw_destroy_plan(bw_plan2_rest) 4239 end if 4240 4241 ABI_FREE(zmpi2) 4242 ABI_FREE(zw) 4243 ABI_FREE(zt) 4244 if (nproc_fft>1) then 4245 ABI_FREE(zmpi1) 4246 end if 4247 4248 #else 4249 ABI_ERROR("FFTW3 support not activated") 4250 ABI_UNUSED((/cplexwf,ndat,n1,n2,n3,nd1,nd2,nd3proc/)) 4251 ABI_UNUSED((/ max1,max2,max3,m1,m2,m3,md1,md2proc,md3,comm_fft/)) 4252 ABI_UNUSED((/zf(1,1,1,1,1),zr(1,1,1,1,1)/)) 4253 #endif 4254 4255 end subroutine fftw3_mpiback_wf
m_fftw3/fftw3_mpiforw [ Functions ]
[ Top ] [ m_fftw3 ] [ Functions ]
NAME
fftw3_mpiforw
FUNCTION
Adopt standard convention that isign=-1 for forward transform CALCULATES THE DISCRETE FOURIERTRANSFORM ZF(I1,I3,I2)= S_(j1,j2,j3) EXP(isign*i*2*pi*(j1*i1/n1+j2*i2/n2+j3*i3/n3)) ZR(j1,j2,j3) in parallel using MPI/OpenMP and BLAS library calls.
INPUTS
ZR: input array ZR(1,i1,i2,i3,idat)=real(R(i1,i2,i3,idat)) ZR(2,i1,i2,i3,idat)=imag(R(i1,i2,i3,idat)) i1=1,n1 , i2=1,n2 , i3=1,n3 , idat=1,ndat OUTPUTS ZF: output array (note the switch of i2 and i3) real(F(i1,i3,i2,idat))=ZF(1,i1,i3,i2,idat) imag(F(i1,i3,i2,idat))=ZF(2,i1,i3,i2,idat) i1=1,n1 , i2=1,n2 , i3=1,n3 , idat=1,ndat nproc_fft: number of processors used as returned by MPI_COMM_SIZE me_fft: [0:nproc_fft-1] number of processor as returned by MPI_COMM_RANK n1,n2,n3: logical dimension of the transform. As transform lengths most products of the prime factors 2,3,5 are allowed. The detailed table with allowed transform lengths can be found in subroutine CTRIG nd1,nd2,nd3: Dimension of ZR and ZF nd2proc=((nd2-1)/nproc_fft)+1 maximal number of 2nd dim slices nd3proc=((nd3-1)/nproc_fft)+1 maximal number of 3rd dim slices
NOTES
SHOULD describe nd1eff SHOULD put cplex and nd1eff in OMP declarations SHOULD describe the change of value of nd2prod The maximum number of processors that can reasonably be used is max(n2,n3) It is very important to find the optimal value of NCACHE. NCACHE determines the size of the work array ZW, that has to fit into cache. It has therefore to be chosen to equal roughly half the size of the physical cache in units of real*8 numbers. The optimal value of ncache can easily be determined by numerical experimentation. A too large value of ncache leads to a dramatic and sudden decrease of performance, a too small value to a to a slow and less dramatic decrease of performance. If NCACHE is set to a value so small, that not even a single one dimensional transform can be done in the workarray zw, the program stops with an error message.
SOURCE
4957 subroutine fftw3_mpiforw(cplex,ndat,n1,n2,n3,nd1,nd2,nd3,nd1eff,nd2proc,nd3proc,option,zr,zf,comm_fft) 4958 4959 !Arguments ------------------------------------ 4960 !scalars 4961 integer,intent(in) :: cplex,comm_fft 4962 integer,intent(in) :: ndat,n1,n2,n3,nd1,nd2,nd3,nd1eff,nd2proc,nd3proc,option 4963 !arrays 4964 real(dp),intent(in) :: zr(2,nd1eff,nd2,nd3proc,ndat) 4965 real(dp),intent(out) :: zf(2,nd1,nd3,nd2proc,ndat) 4966 4967 !Local variables------------------------------- 4968 !scalars 4969 #ifdef HAVE_FFTW3 4970 integer :: j,i1,idat,ierr,j2,j2st,j3,jp2st,lzt,nthreads 4971 integer :: ma,mb,n1dfft,n1eff,n2eff,n1zt,ncache,nnd3,nproc_fft,me_fft,lot1,lot2,lot3 4972 integer(KIND_FFTW_PLAN) :: fw_plan1_lot,fw_plan1_rest 4973 integer(KIND_FFTW_PLAN) :: fw_plan2_lot,fw_plan2_rest 4974 integer(KIND_FFTW_PLAN) :: fw_plan3_lot,fw_plan3_rest 4975 character(len=500) :: msg 4976 !arrays 4977 real(dp), allocatable :: zmpi1(:,:,:,:),zmpi2(:,:,:,:) ! work arrays for MPI 4978 real(dp),allocatable :: zw(:,:),zt(:,:,:) ! cache work array and array for transpositions 4979 4980 ! ************************************************************************* 4981 4982 nproc_fft = xmpi_comm_size(comm_fft); me_fft = xmpi_comm_rank(comm_fft) 4983 4984 ! find cache size that gives optimal performance on machine 4985 ncache=2*max(n1,n2,n3,1024) 4986 if (ncache/(2*max(n1,n2,n3))<1) then 4987 write(msg,'(5a)')& 4988 & 'ncache has to be enlarged to be able to hold at',ch10, & 4989 & 'least one 1-d FFT of each size even though this will',ch10,& 4990 & 'reduce the performance for shorter transform lengths' 4991 ABI_ERROR(msg) 4992 end if 4993 4994 ! check input 4995 if (nd1<n1 .or. nd2<n2 .or. nd3<n3) then 4996 ABI_ERROR("forw: assertion error nd1<n1 .or. nd2<n2 .or. nd3<n3") 4997 end if 4998 4999 !Effective n1 and n2 (complex-to-complex or real-to-complex) 5000 n1eff=n1; n2eff=n2; n1zt=n1 5001 if (cplex==1) then 5002 n1eff=(n1+1)/2; n2eff=n2/2+1; n1zt=2*(n1/2+1) 5003 end if 5004 5005 lzt=n2eff 5006 if (mod(n2eff,2) == 0) lzt=lzt+1 5007 if (mod(n2eff,4) == 0) lzt=lzt+1 5008 5009 ! maximal number of big box 3rd dim slices for all procs 5010 nnd3=nd3proc*nproc_fft 5011 5012 ABI_MALLOC(zw,(2,ncache/2)) 5013 ABI_MALLOC(zt,(2,lzt,n1zt)) 5014 ABI_MALLOC(zmpi2,(2,n1,nd2proc,nnd3)) 5015 if (nproc_fft>1) then 5016 ABI_MALLOC(zmpi1,(2,n1,nd2proc,nnd3)) 5017 end if 5018 5019 ! Create plans. 5020 ! The prototype for sfftw_plan_many_dft is: 5021 ! sfftw_plan_many_dft(rank, n, howmany, 5022 ! fin, iembed, istride, idist, 5023 ! fout, oembed, ostride, odist, isign, my_flags) 5024 5025 lot1=ncache/(2*n1) 5026 lot2=ncache/(2*n2) 5027 lot3=ncache/(2*n3) 5028 5029 nthreads = xomp_get_num_threads(open_parallel=.TRUE.) 5030 !nthreads = 1 5031 5032 fw_plan3_lot = dplan_many_dft_2D(1, [n3], lot3, & 5033 & zw, [ncache/2], lot3, 1, & 5034 & zw, [ncache/2], lot3, 1, ABI_FFTW_FORWARD, ABI_FFTW_ESTIMATE, nthreads) 5035 5036 if (mod(n1, lot3) /= 0) then 5037 fw_plan3_rest = dplan_many_dft_2D(1, [n3], mod(n1, lot3), & 5038 & zw, [ncache/2], lot3, 1, & 5039 & zw, [ncache/2], lot3, 1, ABI_FFTW_FORWARD, ABI_FFTW_ESTIMATE, nthreads) 5040 end if 5041 5042 fw_plan1_lot = dplan_many_dft_2D(1, [n1], lot1, & 5043 & zt, [lzt, n1zt], lzt, 1, & 5044 & zw, [ncache/2], lot1, 1, ABI_FFTW_FORWARD, ABI_FFTW_ESTIMATE, nthreads) 5045 5046 if (mod(n2eff, lot1) /= 0) then 5047 fw_plan1_rest = dplan_many_dft_2D(1, [n1], mod(n2eff, lot1), & 5048 & zt, [lzt, n1zt], lzt, 1, & 5049 & zw, [ncache/2], lot1, 1, ABI_FFTW_FORWARD, ABI_FFTW_ESTIMATE, nthreads) 5050 end if 5051 5052 fw_plan2_lot = dplan_many_dft_2D(1, [n2], lot2, & 5053 & zr, [nd1eff,nd2,nd3proc,ndat], nd1eff, 1, & 5054 & zw, [ncache/2], lot2, 1, ABI_FFTW_FORWARD, ABI_FFTW_ESTIMATE, nthreads) 5055 5056 if (mod(n1eff, lot2) /= 0) then 5057 fw_plan2_rest = dplan_many_dft_2D(1, [n2], mod(n1eff,lot2), & 5058 & zr, [nd1eff,nd2,nd3proc,ndat], nd1eff, 1, & 5059 & zw, [ncache/2], lot2, 1, ABI_FFTW_FORWARD, ABI_FFTW_ESTIMATE, nthreads) 5060 end if 5061 5062 do idat=1,ndat 5063 5064 do j3=1,nd3proc 5065 if (me_fft*(nd3proc)+j3 <= n3) then 5066 Jp2st=1; J2st=1 5067 5068 ! transform along y axis 5069 ! input: R1,R2,R3,(Rp3) 5070 do j=1,n1eff,lot2 5071 ma=j 5072 mb=min(j+(lot2-1),n1eff) 5073 n1dfft=mb-ma+1 5074 5075 if (n1dfft == lot2) then 5076 call dfftw_execute_dft(fw_plan2_lot, zr(1,j,1,j3,idat), zw) 5077 else 5078 call dfftw_execute_dft(fw_plan2_rest, zr(1,j,1,j3,idat), zw) 5079 end if 5080 5081 ! input: R1,G2,R3,(Rp3) 5082 ! output: G2,R1,R3,(Rp3) 5083 if (cplex==2) then 5084 call unswitch(n1dfft,n2,lot2,n1zt,lzt,zw,zt(1,1,j)) 5085 else 5086 call unswitchreal(n1dfft,n2,n2eff,lot2,n1zt,lzt,zw,zt(1,1,2*j-1)) 5087 end if 5088 end do 5089 5090 ! transform along x axis 5091 ! input: G2,R1,R3,(Rp3) 5092 do j=1,n2eff,lot1 5093 ma=j 5094 mb=min(j+(lot1-1),n2eff) 5095 n1dfft=mb-ma+1 5096 5097 if (n1dfft == lot1) then 5098 call dfftw_execute_dft(fw_plan1_lot, zt(1,j,1), zw) 5099 else 5100 call dfftw_execute_dft(fw_plan1_rest, zt(1,j,1), zw) 5101 end if 5102 5103 ! input: G2,G1,R3,Gp2,(Rp3) 5104 ! output: G1,G2,R3,Gp2,(Rp3) 5105 ! write(std_out,*) 'J2st,Jp2st',J2st,Jp2st 5106 if (nproc_fft == 1) then 5107 call unmpiswitch(j3,n1dfft,Jp2st,J2st,lot1,n1,nd2proc,nd3proc,nproc_fft,option,zw,zmpi2) 5108 else 5109 call unmpiswitch(j3,n1dfft,Jp2st,J2st,lot1,n1,nd2proc,nd3proc,nproc_fft,option,zw,zmpi1) 5110 end if 5111 end do 5112 5113 end if 5114 end do ! j3 5115 5116 ! Interprocessor data transposition 5117 ! input: G1,G2,R3,Gp2,(Rp3) 5118 ! output: G1,G2,R3,Rp3,(Gp2) 5119 if (nproc_fft>1) then 5120 call xmpi_alltoall(zmpi1,2*n1*nd2proc*nd3proc, & 5121 & zmpi2,2*n1*nd2proc*nd3proc,comm_fft,ierr) 5122 end if 5123 5124 ! transform along z axis 5125 ! input: G1,G2,R3,(Gp2) 5126 5127 do j2=1,nd2proc 5128 if (me_fft*(nd2proc)+j2 <= n2eff) then 5129 do i1=1,n1,lot3 5130 ma=i1 5131 mb=min(i1+(lot3-1),n1) 5132 n1dfft=mb-ma+1 5133 5134 ! input: G1,G2,R3,(Gp2) 5135 ! output: G1,R3,G2,(Gp2) 5136 call unscramble(i1,j2,lot3,n1dfft,n1,n3,nd2proc,nd3,zmpi2,zw) 5137 5138 if (n1dfft == lot3) then 5139 call dfftw_execute_dft(fw_plan3_lot, zw, zw) 5140 else 5141 call dfftw_execute_dft(fw_plan3_rest, zw, zw) 5142 end if 5143 5144 call unfill(nd1,nd3,lot3,n1dfft,n3,zw,zf(1,i1,1,j2,idat)) 5145 ! output: G1,G3,G2,(Gp2) 5146 end do 5147 end if 5148 end do 5149 5150 end do ! idat 5151 5152 call dfftw_destroy_plan(fw_plan3_lot) 5153 if (mod(n1, lot3) /= 0) then 5154 call dfftw_destroy_plan(fw_plan3_rest) 5155 end if 5156 5157 call dfftw_destroy_plan(fw_plan1_lot) 5158 if (mod(n2eff, lot1) /= 0) then 5159 call dfftw_destroy_plan(fw_plan1_rest) 5160 end if 5161 5162 call dfftw_destroy_plan(fw_plan2_lot) 5163 if (mod(n1eff, lot2) /= 0) then 5164 call dfftw_destroy_plan(fw_plan2_rest) 5165 end if 5166 5167 ABI_FREE(zmpi2) 5168 ABI_FREE(zw) 5169 ABI_FREE(zt) 5170 if (nproc_fft>1) then 5171 ABI_FREE(zmpi1) 5172 end if 5173 5174 #else 5175 ABI_ERROR("FFTW3 support not activated") 5176 ABI_UNUSED((/cplex,ndat,n1,n2,n3,nd1,nd2,nd1eff,nd2proc,nd3proc,option,comm_fft/)) 5177 ABI_UNUSED((/zf(1,1,1,1,1),zr(1,1,1,1,1)/)) 5178 #endif 5179 5180 end subroutine fftw3_mpiforw
m_fftw3/fftw3_mpiforw_manywf [ Functions ]
[ Top ] [ m_fftw3 ] [ Functions ]
NAME
fftw3_mpiforw_manywf
FUNCTION
Does multiple 3-dim backward FFTs from real into Fourier space Adopt standard convention that isign=-1 for forward transform CALCULATES THE DISCRETE FOURIERTRANSFORM ZF(I1,I3,I2)=S_(j1,j2,j3) EXP(isign*i*2*pi*(j1*i1/n1+j2*i2/n2+j3*i3/n3)) ZR(j1,j2,j3) in parallel using MPI/OpenMP. INPUT: ZR: input array ZR(1,i1,i2,i3,idat)=real(R(i1,i2,i3,idat)) ZR(2,i1,i2,i3,idat)=imag(R(i1,i2,i3,idat)) i1=1,n1 , i2=1,n2 , i3=1,n3 , idat=1,ndat NOTE that ZR is changed by the routine n1,n2,n3: logical dimension of the transform. As transform lengths most products of the prime factors 2,3,5 are allowed. The detailed table with allowed transform lengths can be found in subroutine CTRIG nd1,nd2,nd3: Dimension of ZR nd3proc=((nd3-1)/nproc_fft)+1 maximal number of big box 3rd dim slices for one proc OUTPUT: ZF: output array (note the switch of i2 and i3) real(F(i1,i3,i2,idat))=ZF(1,i1,i3,i2,idat) imag(F(i1,i3,i2,idat))=ZF(2,i1,i3,i2,idat) max1 is positive or zero ; m1 >=max1+1 i1= 1... max1+1 corresponds to positive and zero wavevectors 0 ... max1 then, if m1 > max1+1, one has min1=max1-m1+1 and i1= max1+2 ... m1 corresponds to negative wavevectors min1 ... -1 i2 and i3 have a similar definition of range idat=1,ndat md1,md2,md3: Dimension of ZF md2proc=((md2-1)/nproc_fft)+1 maximal number of small box 2nd dim slices for one proc nproc_fft: number of processors used as returned by MPI_COMM_SIZE me_fft: [0:nproc-1] rank of the processor in the FFT communicator. comm_fft=MPI communicator for parallel FFT.
NOTES
The maximum number of processors that can reasonably be used is max(n2/2,n3/2) It is very important to find the optimal value of NCACHE. NCACHE determines the size of the work array ZW, that has to fit into cache. It has therefore to be chosen to equal roughly half the size of the physical cache in units of real*8 numbers. The optimal value of ncache can easily be determined by numerical experimentation. A too large value of ncache leads to a dramatic and sudden decrease of performance, a too small value to a to a slow and less dramatic decrease of performance. If NCACHE is set to a value so small, that not even a single one dimensional transform can be done in the workarray zw, the program stops with an error message.
SOURCE
6462 subroutine fftw3_mpiforw_manywf(cplexwf,ndat,n1,n2,n3,nd1,nd2,nd3proc,& 6463 & max1,max2,max3,m1,m2,m3,md1,md2proc,md3,zr,zf,comm_fft) 6464 6465 !Arguments ------------------------------------ 6466 !scalars 6467 integer,intent(in) :: cplexwf,ndat,n1,n2,n3,nd1,nd2,nd3proc 6468 integer,intent(in) :: max1,max2,max3,m1,m2,m3,md1,md2proc,md3,comm_fft 6469 !arrays 6470 real(dp),intent(inout) :: zr(2,nd1,nd2,nd3proc,ndat) 6471 real(dp),intent(out) :: zf(2,md1,md3,md2proc,ndat) 6472 6473 !Local variables------------------------------- 6474 !scalars 6475 #ifdef HAVE_FFTW3 6476 integer :: j,i1,i2,i3,idat,ierr,nproc_fft,me_fft 6477 integer :: ioption,j2,j3,j2st,jp2st,lot1,lot2,lot3,lzt,m1zt,ma,mb,n1dfft,nnd3 6478 integer :: m2eff,ncache,n1eff,n1half,i1inv,i2inv,i3inv,nthreads 6479 integer(KIND_FFTW_PLAN) :: fw_plan1_lot,fw_plan1_rest 6480 integer(KIND_FFTW_PLAN) :: fw_plan2_lot,fw_plan2_rest 6481 integer(KIND_FFTW_PLAN) :: fw_plan3_lot,fw_plan3_rest 6482 character(len=500) :: msg 6483 !arrays 6484 integer :: requests(ndat) 6485 real(dp) ABI_ASYNC, allocatable :: zmpi1(:,:,:,:,:),zmpi2(:,:,:,:,:) ! work arrays for MPI 6486 real(dp),allocatable :: zw(:,:),zt(:,:,:) ! cache work array and array for transpositions 6487 ! FFT work arrays 6488 real(dp) :: tsec(2) 6489 6490 ! ************************************************************************* 6491 6492 ! FIXME must provide a default value but which one? 6493 !ioption = 0 6494 ioption = 1 6495 !if (paral_kgb==1) ioption=1 6496 6497 nproc_fft = xmpi_comm_size(comm_fft); me_fft = xmpi_comm_rank(comm_fft) 6498 6499 ! find cache size that gives optimal performance on machine 6500 ncache=2*max(n1,n2,n3,1024) 6501 !ncache=2*max(n1,n2,n3,16*1024) 6502 6503 if (ncache/(2*max(n1,n2,n3))<1) then 6504 write(msg,'(5a)') & 6505 & 'ncache has to be enlarged to be able to hold at',ch10, & 6506 & 'least one 1-d FFT of each size even though this will',ch10,& 6507 & 'reduce the performance for shorter transform lengths' 6508 ABI_ERROR(msg) 6509 end if 6510 6511 ! Effective m1 and m2 (complex-to-complex or real-to-complex) 6512 n1eff=n1; m2eff=m2; m1zt=n1 6513 if (cplexwf==1) then 6514 n1eff=(n1+1)/2; m2eff=m2/2+1; m1zt=2*(n1/2+1) 6515 end if 6516 6517 lzt=m2eff 6518 if (mod(m2eff,2)==0) lzt=lzt+1 6519 if (mod(m2eff,4)==0) lzt=lzt+1 6520 6521 ! maximal number of big box 3rd dim slices for all procs 6522 nnd3=nd3proc*nproc_fft 6523 6524 ABI_MALLOC(zw,(2,ncache/2)) 6525 ABI_MALLOC(zt,(2,lzt,m1zt)) 6526 ABI_MALLOC(zmpi2,(2,md1,md2proc,nnd3,ndat)) 6527 if (nproc_fft>1) then 6528 ABI_MALLOC(zmpi1,(2,md1,md2proc,nnd3,ndat)) 6529 end if 6530 6531 ! Create plans. 6532 ! The prototype for sfftw_plan_many_dft is: 6533 ! sfftw_plan_many_dft(rank, n, howmany, 6534 ! fin, iembed, istride, idist, 6535 ! fout, oembed, ostride, odist, isign, my_flags) 6536 6537 lot2=ncache/(2*n2) 6538 lot1=ncache/(2*n1) 6539 lot3=ncache/(2*n3) 6540 6541 nthreads = xomp_get_num_threads(open_parallel=.TRUE.) 6542 !nthreads = 1 6543 6544 fw_plan3_lot = dplan_many_dft_2D(1, [n3], lot3, & 6545 & zw, [ncache/2], lot3, 1, & 6546 & zw, [ncache/2], lot3, 1, ABI_FFTW_FORWARD, ABI_FFTW_ESTIMATE, nthreads) 6547 6548 if (mod(m1, lot3) /= 0) then 6549 fw_plan3_rest = dplan_many_dft_2D(1, [n3], mod(m1, lot3), & 6550 & zw, [ncache/2], lot3, 1, & 6551 & zw, [ncache/2], lot3, 1, ABI_FFTW_FORWARD, ABI_FFTW_ESTIMATE, nthreads) 6552 end if 6553 6554 fw_plan1_lot = dplan_many_dft_2D(1, [n1], lot1, & 6555 & zt, [lzt, m1zt], lzt, 1, & 6556 & zw, [ncache/2], lot1, 1, ABI_FFTW_FORWARD, ABI_FFTW_ESTIMATE, nthreads) 6557 6558 if (mod(m2eff, lot1) /= 0) then 6559 fw_plan1_rest = dplan_many_dft_2D(1, [n1], mod(m2eff, lot1), & 6560 & zt, [lzt, m1zt], lzt, 1, & 6561 & zw, [ncache/2], lot1, 1, ABI_FFTW_FORWARD, ABI_FFTW_ESTIMATE, nthreads) 6562 end if 6563 6564 fw_plan2_lot = dplan_many_dft_2D(1, [n2], lot2, & 6565 & zr, [nd1,nd2,nd3proc,ndat], nd1, 1, & 6566 & zw, [ncache/2], lot2, 1, ABI_FFTW_FORWARD, ABI_FFTW_ESTIMATE, nthreads) 6567 6568 if (mod(n1eff, lot2) /= 0) then 6569 fw_plan2_rest = dplan_many_dft_2D(1, [n2], mod(n1eff,lot2), & 6570 & zr, [nd1,nd2,nd3proc,ndat], nd1, 1, & 6571 & zw, [ncache/2], lot2, 1, ABI_FFTW_FORWARD, ABI_FFTW_ESTIMATE, nthreads) 6572 end if 6573 6574 do idat=1,ndat 6575 ! Loop over the z-planes treated by this node 6576 do j3=1,nd3proc 6577 6578 if (me_fft*nd3proc+j3 <= n3) then 6579 Jp2st=1 6580 J2st=1 6581 6582 ! Treat real wavefunctions. 6583 if (cplexwf==1) then 6584 n1half=n1/2 6585 do i2=1,n2 6586 do i1=1,n1half 6587 zr(1,i1,i2,j3,idat)=zr(1,2*i1-1,i2,j3,idat) 6588 zr(2,i1,i2,j3,idat)=zr(1,2*i1 ,i2,j3,idat) 6589 end do 6590 end do 6591 ! If odd 6592 if(n1half*2/=n1)then 6593 do i2=1,n2 6594 zr(1,n1eff,i2,j3,idat)=zr(1,n1,i2,j3,idat) 6595 zr(2,n1eff,i2,j3,idat)=zero 6596 end do 6597 end if 6598 end if 6599 6600 ! transform along y axis 6601 ! input: R1,R2,R3,(Rp3) 6602 ! input: R1,G2,R3,(Rp3) 6603 do j=1,n1eff,lot2 6604 ma=j 6605 mb=min(j+(lot2-1),n1eff) 6606 n1dfft=mb-ma+1 6607 6608 if (n1dfft == lot2) then 6609 call dfftw_execute_dft(fw_plan2_lot, zr(1,j,1,j3,idat), zw) 6610 else 6611 call dfftw_execute_dft(fw_plan2_rest, zr(1,j,1,j3,idat), zw) 6612 end if 6613 6614 ! input: R1,G2,R3,(Rp3) 6615 ! output: G2,R1,R3,(Rp3) 6616 if (cplexwf==2) then 6617 call unswitch_cent(n1dfft,max2,m2,n2,lot2,n1,lzt,zw,zt(1,1,j)) 6618 else 6619 call unswitchreal_cent(n1dfft,max2,n2,lot2,n1,lzt,zw,zt(1,1,2*j-1)) 6620 end if 6621 end do 6622 6623 ! transform along x axis 6624 ! input: G2,R1,R3,(Rp3) 6625 do j=1,m2eff,lot1 6626 ma=j 6627 mb=min(j+(lot1-1),m2eff) 6628 n1dfft=mb-ma+1 6629 6630 if (n1dfft == lot1) then 6631 call dfftw_execute_dft(fw_plan1_lot, zt(1,j,1), zw) 6632 else 6633 call dfftw_execute_dft(fw_plan1_rest, zt(1,j,1), zw) 6634 end if 6635 ! output: G2,G1,R3,(Rp3) 6636 6637 ! input: G2,G1,R3,Gp2,(Rp3) 6638 ! output: G1,G2,R3,Gp2,(Rp3) 6639 if (nproc_fft==1) then 6640 call unmpiswitch_cent(j3,n1dfft,Jp2st,J2st,lot1,max1,md1,m1,n1,& 6641 & md2proc,nd3proc,nproc_fft,ioption,zw,zmpi2(:,:,:,:,idat)) 6642 else 6643 call unmpiswitch_cent(j3,n1dfft,Jp2st,J2st,lot1,max1,md1,m1,n1,& 6644 & md2proc,nd3proc,nproc_fft,ioption,zw,zmpi1(:,:,:,:,idat)) 6645 end if 6646 end do 6647 end if 6648 end do ! j3 6649 6650 ! Interprocessor data transposition 6651 ! input: G1,G2,R3,Gp2,(Rp3) 6652 ! output: G1,G2,R3,Rp3,(Gp2) 6653 if (nproc_fft>1) then 6654 call timab(544,1,tsec) 6655 call xmpi_ialltoall(zmpi1(:,:,:,:,idat),2*md1*md2proc*nd3proc, & 6656 & zmpi2(:,:,:,:,idat),2*md1*md2proc*nd3proc,comm_fft,requests(idat)) 6657 call timab(544,2,tsec) 6658 end if 6659 end do 6660 6661 do idat=1,ndat 6662 if (nproc_fft>1) call xmpi_wait(requests(idat),ierr) 6663 ! transform along z axis 6664 ! input: G1,G2,R3,(Gp2) 6665 6666 do j2=1,md2proc 6667 if (me_fft*md2proc+j2 <= m2eff) then 6668 ! write(std_out,*)' forwf_wf : before unscramble, j2,md2proc,me_fft,m2=',j2,md2proc,me_fft,m2 6669 do i1=1,m1,lot3 6670 ma=i1 6671 mb=min(i1+(lot3-1),m1) 6672 n1dfft=mb-ma+1 6673 6674 ! input: G1,G2,R3,(Gp2) 6675 ! output: G1,R3,G2,(Gp2) 6676 call unscramble(i1,j2,lot3,n1dfft,md1,n3,md2proc,nnd3,zmpi2(:,:,:,:,idat),zw) 6677 6678 if (n1dfft == lot3) then 6679 call dfftw_execute_dft(fw_plan3_lot, zw, zw) 6680 else 6681 call dfftw_execute_dft(fw_plan3_rest, zw, zw) 6682 end if 6683 6684 call unfill_cent(md1,md3,lot3,n1dfft,max3,m3,n3,zw,zf(1,i1,1,j2,idat)) 6685 ! output: G1,G3,G2,(Gp2) 6686 end do 6687 end if 6688 end do 6689 6690 if (cplexwf==1) then 6691 ! Complete missing values with complex conjugate 6692 ! Inverse of ix is located at nx+2-ix , except for ix=1, for which it is 1. 6693 do i3=1,m3 6694 i3inv=m3+2-i3 6695 if(i3==1)i3inv=1 6696 6697 if (m2eff>1) then 6698 do i2=2,m2eff 6699 i2inv=m2+2-i2 6700 zf(1,1,i3inv,i2inv,idat)= zf(1,1,i3,i2,idat) 6701 zf(2,1,i3inv,i2inv,idat)=-zf(2,1,i3,i2,idat) 6702 do i1=2,m1 6703 i1inv=m1+2-i1 6704 zf(1,i1inv,i3inv,i2inv,idat)= zf(1,i1,i3,i2,idat) 6705 zf(2,i1inv,i3inv,i2inv,idat)=-zf(2,i1,i3,i2,idat) 6706 end do 6707 end do 6708 end if 6709 end do 6710 end if 6711 6712 end do ! idat 6713 6714 call dfftw_destroy_plan(fw_plan3_lot) 6715 if (mod(m1, lot3) /= 0) then 6716 call dfftw_destroy_plan(fw_plan3_rest) 6717 end if 6718 6719 call dfftw_destroy_plan(fw_plan1_lot) 6720 if (mod(m2eff, lot1) /= 0) then 6721 call dfftw_destroy_plan(fw_plan1_rest) 6722 end if 6723 6724 call dfftw_destroy_plan(fw_plan2_lot) 6725 if (mod(n1eff, lot2) /= 0) then 6726 call dfftw_destroy_plan(fw_plan2_rest) 6727 end if 6728 6729 ABI_FREE(zmpi2) 6730 ABI_FREE(zw) 6731 ABI_FREE(zt) 6732 if (nproc_fft>1) then 6733 ABI_FREE(zmpi1) 6734 end if 6735 6736 #else 6737 ABI_ERROR("FFTW3 support not activated") 6738 ABI_UNUSED((/cplexwf,ndat,n1,n2,n3,nd1,nd2,nd3proc/)) 6739 ABI_UNUSED((/max1,max2,max3,m1,m2,m3,md1,md2proc,md3,comm_fft/)) 6740 ABI_UNUSED((/zf(1,1,1,1,1),zr(1,1,1,1,1)/)) 6741 #endif 6742 6743 end subroutine fftw3_mpiforw_manywf
m_fftw3/fftw3_mpiforw_wf [ Functions ]
[ Top ] [ m_fftw3 ] [ Functions ]
NAME
fftw3_mpiforw_wf
FUNCTION
Does multiple 3-dim backward FFTs from real into Fourier space Adopt standard convention that isign=-1 for forward transform CALCULATES THE DISCRETE FOURIERTRANSFORM ZF(I1,I3,I2)=S_(j1,j2,j3) EXP(isign*i*2*pi*(j1*i1/n1+j2*i2/n2+j3*i3/n3)) ZR(j1,j2,j3) in parallel using MPI/OpenMP. INPUT: ZR: input array ZR(1,i1,i2,i3,idat)=real(R(i1,i2,i3,idat)) ZR(2,i1,i2,i3,idat)=imag(R(i1,i2,i3,idat)) i1=1,n1 , i2=1,n2 , i3=1,n3 , idat=1,ndat NOTE that ZR is changed by the routine n1,n2,n3: logical dimension of the transform. As transform lengths most products of the prime factors 2,3,5 are allowed. The detailed table with allowed transform lengths can be found in subroutine CTRIG nd1,nd2,nd3: Dimension of ZR nd3proc=((nd3-1)/nproc_fft)+1 maximal number of big box 3rd dim slices for one proc OUTPUT: ZF: output array (note the switch of i2 and i3) real(F(i1,i3,i2,idat))=ZF(1,i1,i3,i2,idat) imag(F(i1,i3,i2,idat))=ZF(2,i1,i3,i2,idat) max1 is positive or zero ; m1 >=max1+1 i1= 1... max1+1 corresponds to positive and zero wavevectors 0 ... max1 then, if m1 > max1+1, one has min1=max1-m1+1 and i1= max1+2 ... m1 corresponds to negative wavevectors min1 ... -1 i2 and i3 have a similar definition of range idat=1,ndat md1,md2,md3: Dimension of ZF md2proc=((md2-1)/nproc_fft)+1 maximal number of small box 2nd dim slices for one proc nproc_fft: number of processors used as returned by MPI_COMM_SIZE me_fft: [0:nproc-1] rank of the processor in the FFT communicator. comm_fft=MPI communicator for parallel FFT.
NOTES
The maximum number of processors that can reasonably be used is max(n2/2,n3/2) It is very important to find the optimal value of NCACHE. NCACHE determines the size of the work array ZW, that has to fit into cache. It has therefore to be chosen to equal roughly half the size of the physical cache in units of real*8 numbers. The optimal value of ncache can easily be determined by numerical experimentation. A too large value of ncache leads to a dramatic and sudden decrease of performance, a too small value to a to a slow and less dramatic decrease of performance. If NCACHE is set to a value so small, that not even a single one dimensional transform can be done in the workarray zw, the program stops with an error message.
SOURCE
4319 subroutine fftw3_mpiforw_wf(cplexwf,ndat,n1,n2,n3,nd1,nd2,nd3proc,& 4320 & max1,max2,max3,m1,m2,m3,md1,md2proc,md3,zr,zf,comm_fft) 4321 4322 !Arguments ------------------------------------ 4323 !scalars 4324 integer,intent(in) :: cplexwf,ndat,n1,n2,n3,nd1,nd2,nd3proc 4325 integer,intent(in) :: max1,max2,max3,m1,m2,m3,md1,md2proc,md3,comm_fft 4326 !arrays 4327 real(dp),intent(inout) :: zr(2,nd1,nd2,nd3proc,ndat) 4328 real(dp),intent(out) :: zf(2,md1,md3,md2proc,ndat) 4329 4330 !Local variables------------------------------- 4331 !scalars 4332 #ifdef HAVE_FFTW3 4333 integer :: j,i1,i2,i3,idat,ierr,nproc_fft,me_fft,nthreads 4334 integer :: ioption,j2,j3,j2st,jp2st,lot1,lot2,lot3,lzt,m1zt,ma,mb,n1dfft,nnd3 4335 integer :: m2eff,ncache,n1eff,n1half,i1inv,i2inv,i3inv 4336 integer(KIND_FFTW_PLAN) :: fw_plan1_lot,fw_plan1_rest 4337 integer(KIND_FFTW_PLAN) :: fw_plan2_lot,fw_plan2_rest 4338 integer(KIND_FFTW_PLAN) :: fw_plan3_lot,fw_plan3_rest 4339 character(len=500) :: msg 4340 !arrays 4341 real(dp), allocatable :: zmpi1(:,:,:,:),zmpi2(:,:,:,:) ! work arrays for MPI 4342 real(dp),allocatable :: zw(:,:),zt(:,:,:) ! cache work array and array for transpositions 4343 ! FFT work arrays 4344 real(dp) :: tsec(2) 4345 4346 ! ************************************************************************* 4347 4348 ! FIXME must provide a default value but which one? 4349 !ioption = 0 4350 ioption = 1 4351 !if (paral_kgb==1) ioption=1 4352 4353 nproc_fft = xmpi_comm_size(comm_fft); me_fft = xmpi_comm_rank(comm_fft) 4354 4355 ! find cache size that gives optimal performance on machine 4356 ncache=2*max(n1,n2,n3,1024) 4357 !ncache=2*max(n1,n2,n3,16*1024) 4358 4359 if (ncache/(2*max(n1,n2,n3))<1) then 4360 write(msg,'(5a)') & 4361 & 'ncache has to be enlarged to be able to hold at',ch10, & 4362 & 'least one 1-d FFT of each size even though this will',ch10,& 4363 & 'reduce the performance for shorter transform lengths' 4364 ABI_ERROR(msg) 4365 end if 4366 4367 ! Effective m1 and m2 (complex-to-complex or real-to-complex) 4368 n1eff=n1; m2eff=m2; m1zt=n1 4369 if (cplexwf==1) then 4370 n1eff=(n1+1)/2; m2eff=m2/2+1; m1zt=2*(n1/2+1) 4371 end if 4372 4373 lzt=m2eff 4374 if (mod(m2eff,2)==0) lzt=lzt+1 4375 if (mod(m2eff,4)==0) lzt=lzt+1 4376 4377 ! maximal number of big box 3rd dim slices for all procs 4378 nnd3=nd3proc*nproc_fft 4379 4380 ABI_MALLOC(zw,(2,ncache/2)) 4381 ABI_MALLOC(zt,(2,lzt,m1zt)) 4382 ABI_MALLOC(zmpi2,(2,md1,md2proc,nnd3)) 4383 if (nproc_fft>1) then 4384 ABI_MALLOC(zmpi1,(2,md1,md2proc,nnd3)) 4385 end if 4386 4387 !DEBUG 4388 ! write(std_out,'(2a,3i4)' )itoa(me_fft),'fftw3_mpiforw_wf, enter, i1,i2,i3,zr,n1,n2,n3',n1,n2,n3 4389 ! write(std_out,'(2a,3i4)' )itoa(me_fft),'nd1,nd2,nd3proc',nd1,nd2,nd3proc 4390 ! write(std_out,'(2a,3i4)' )itoa(me_fft),'m1,m2,m3',m1,m2,m3 4391 ! write(std_out,'(2a,3i4)' )itoa(me_fft),'max1,max2,max3',max1,max2,max3 4392 ! write(std_out,'(2a,3i4)' )itoa(me_fft),'md1,md2proc,md3',md1,md2proc,md3 4393 ! write(std_out,'(2a,3i4)' )itoa(me_fft),'n1eff,m2eff,m1zt',n1eff,m2eff,m1zt 4394 !ENDDEBUG 4395 4396 ! Create plans. 4397 ! The prototype for sfftw_plan_many_dft is: 4398 ! sfftw_plan_many_dft(rank, n, howmany, 4399 ! fin, iembed, istride, idist, 4400 ! fout, oembed, ostride, odist, isign, my_flags) 4401 4402 lot2=ncache/(2*n2) 4403 lot1=ncache/(2*n1) 4404 lot3=ncache/(2*n3) 4405 4406 nthreads = xomp_get_num_threads(open_parallel=.TRUE.) 4407 !nthreads = 1 4408 4409 fw_plan3_lot = dplan_many_dft_2D(1, [n3], lot3, & 4410 & zw, [ncache/2], lot3, 1, & 4411 & zw, [ncache/2], lot3, 1, ABI_FFTW_FORWARD, ABI_FFTW_ESTIMATE,nthreads) 4412 4413 if (mod(m1, lot3) /= 0) then 4414 fw_plan3_rest = dplan_many_dft_2D(1, [n3], mod(m1, lot3), & 4415 & zw, [ncache/2], lot3, 1, & 4416 & zw, [ncache/2], lot3, 1, ABI_FFTW_FORWARD, ABI_FFTW_ESTIMATE, nthreads) 4417 end if 4418 4419 fw_plan1_lot = dplan_many_dft_2D(1, [n1], lot1, & 4420 & zt, [lzt, m1zt], lzt, 1, & 4421 & zw, [ncache/2], lot1, 1, ABI_FFTW_FORWARD, ABI_FFTW_ESTIMATE, nthreads) 4422 4423 if (mod(m2eff, lot1) /= 0) then 4424 fw_plan1_rest = dplan_many_dft_2D(1, [n1], mod(m2eff, lot1), & 4425 & zt, [lzt, m1zt], lzt, 1, & 4426 & zw, [ncache/2], lot1, 1, ABI_FFTW_FORWARD, ABI_FFTW_ESTIMATE, nthreads) 4427 end if 4428 4429 fw_plan2_lot = dplan_many_dft_2D(1, [n2], lot2, & 4430 & zr, [nd1,nd2,nd3proc,ndat], nd1, 1, & 4431 & zw, [ncache/2], lot2, 1, ABI_FFTW_FORWARD, ABI_FFTW_ESTIMATE, nthreads) 4432 4433 if (mod(n1eff, lot2) /= 0) then 4434 fw_plan2_rest = dplan_many_dft_2D(1, [n2], mod(n1eff,lot2), & 4435 & zr, [nd1,nd2,nd3proc,ndat], nd1, 1, & 4436 & zw, [ncache/2], lot2, 1, ABI_FFTW_FORWARD, ABI_FFTW_ESTIMATE, nthreads) 4437 end if 4438 4439 do idat=1,ndat 4440 ! Loop over the z-planes treated by this node 4441 do j3=1,nd3proc 4442 4443 if (me_fft*nd3proc+j3 <= n3) then 4444 Jp2st=1 4445 J2st=1 4446 4447 ! Treat real wavefunctions. 4448 if (cplexwf==1) then 4449 n1half=n1/2 4450 do i2=1,n2 4451 do i1=1,n1half 4452 zr(1,i1,i2,j3,idat)=zr(1,2*i1-1,i2,j3,idat) 4453 zr(2,i1,i2,j3,idat)=zr(1,2*i1 ,i2,j3,idat) 4454 end do 4455 end do 4456 ! If odd 4457 if(n1half*2/=n1)then 4458 do i2=1,n2 4459 zr(1,n1eff,i2,j3,idat)=zr(1,n1,i2,j3,idat) 4460 zr(2,n1eff,i2,j3,idat)=zero 4461 end do 4462 end if 4463 end if 4464 4465 ! transform along y axis 4466 ! input: R1,R2,R3,(Rp3) 4467 ! input: R1,G2,R3,(Rp3) 4468 do j=1,n1eff,lot2 4469 ma=j 4470 mb=min(j+(lot2-1),n1eff) 4471 n1dfft=mb-ma+1 4472 4473 if (n1dfft == lot2) then 4474 call dfftw_execute_dft(fw_plan2_lot, zr(1,j,1,j3,idat), zw) 4475 else 4476 call dfftw_execute_dft(fw_plan2_rest, zr(1,j,1,j3,idat), zw) 4477 end if 4478 4479 ! input: R1,G2,R3,(Rp3) 4480 ! output: G2,R1,R3,(Rp3) 4481 if (cplexwf==2) then 4482 call unswitch_cent(n1dfft,max2,m2,n2,lot2,n1,lzt,zw,zt(1,1,j)) 4483 else 4484 call unswitchreal_cent(n1dfft,max2,n2,lot2,n1,lzt,zw,zt(1,1,2*j-1)) 4485 end if 4486 end do 4487 4488 ! transform along x axis 4489 ! input: G2,R1,R3,(Rp3) 4490 do j=1,m2eff,lot1 4491 ma=j 4492 mb=min(j+(lot1-1),m2eff) 4493 n1dfft=mb-ma+1 4494 4495 if (n1dfft == lot1) then 4496 call dfftw_execute_dft(fw_plan1_lot, zt(1,j,1), zw) 4497 else 4498 call dfftw_execute_dft(fw_plan1_rest, zt(1,j,1), zw) 4499 end if 4500 ! output: G2,G1,R3,(Rp3) 4501 4502 ! input: G2,G1,R3,Gp2,(Rp3) 4503 ! output: G1,G2,R3,Gp2,(Rp3) 4504 if (nproc_fft==1) then 4505 call unmpiswitch_cent(j3,n1dfft,Jp2st,J2st,lot1,max1,md1,m1,n1,& 4506 & md2proc,nd3proc,nproc_fft,ioption,zw,zmpi2) 4507 else 4508 call unmpiswitch_cent(j3,n1dfft,Jp2st,J2st,lot1,max1,md1,m1,n1,& 4509 & md2proc,nd3proc,nproc_fft,ioption,zw,zmpi1) 4510 end if 4511 end do 4512 4513 end if 4514 end do ! j3 4515 4516 ! Interprocessor data transposition 4517 ! input: G1,G2,R3,Gp2,(Rp3) 4518 ! output: G1,G2,R3,Rp3,(Gp2) 4519 if (nproc_fft>1) then 4520 call timab(544,1,tsec) 4521 call xmpi_alltoall(zmpi1,2*md1*md2proc*nd3proc, & 4522 & zmpi2,2*md1*md2proc*nd3proc,comm_fft,ierr) 4523 call timab(544,2,tsec) 4524 end if 4525 4526 ! transform along z axis 4527 ! input: G1,G2,R3,(Gp2) 4528 4529 do j2=1,md2proc 4530 if (me_fft*md2proc+j2 <= m2eff) then 4531 ! write(std_out,*)' forwf_wf : before unscramble, j2,md2proc,me_fft,m2=',j2,md2proc,me_fft,m2 4532 do i1=1,m1,lot3 4533 ma=i1 4534 mb=min(i1+(lot3-1),m1) 4535 n1dfft=mb-ma+1 4536 4537 ! input: G1,G2,R3,(Gp2) 4538 ! output: G1,R3,G2,(Gp2) 4539 call unscramble(i1,j2,lot3,n1dfft,md1,n3,md2proc,nnd3,zmpi2,zw) 4540 4541 if (n1dfft == lot3) then 4542 call dfftw_execute_dft(fw_plan3_lot, zw, zw) 4543 else 4544 call dfftw_execute_dft(fw_plan3_rest, zw, zw) 4545 end if 4546 4547 call unfill_cent(md1,md3,lot3,n1dfft,max3,m3,n3,zw,zf(1,i1,1,j2,idat)) 4548 ! output: G1,G3,G2,(Gp2) 4549 end do 4550 end if 4551 end do 4552 4553 if (cplexwf==1) then 4554 ! Complete missing values with complex conjugate 4555 ! Inverse of ix is located at nx+2-ix , except for ix=1, for which it is 1. 4556 do i3=1,m3 4557 i3inv=m3+2-i3 4558 if(i3==1)i3inv=1 4559 4560 if (m2eff>1) then 4561 do i2=2,m2eff 4562 i2inv=m2+2-i2 4563 zf(1,1,i3inv,i2inv,idat)= zf(1,1,i3,i2,idat) 4564 zf(2,1,i3inv,i2inv,idat)=-zf(2,1,i3,i2,idat) 4565 do i1=2,m1 4566 i1inv=m1+2-i1 4567 zf(1,i1inv,i3inv,i2inv,idat)= zf(1,i1,i3,i2,idat) 4568 zf(2,i1inv,i3inv,i2inv,idat)=-zf(2,i1,i3,i2,idat) 4569 end do 4570 end do 4571 end if 4572 end do 4573 end if 4574 4575 end do ! idat 4576 4577 call dfftw_destroy_plan(fw_plan3_lot) 4578 if (mod(m1, lot3) /= 0) then 4579 call dfftw_destroy_plan(fw_plan3_rest) 4580 end if 4581 4582 call dfftw_destroy_plan(fw_plan1_lot) 4583 if (mod(m2eff, lot1) /= 0) then 4584 call dfftw_destroy_plan(fw_plan1_rest) 4585 end if 4586 4587 call dfftw_destroy_plan(fw_plan2_lot) 4588 if (mod(n1eff, lot2) /= 0) then 4589 call dfftw_destroy_plan(fw_plan2_rest) 4590 end if 4591 4592 ABI_FREE(zmpi2) 4593 ABI_FREE(zw) 4594 ABI_FREE(zt) 4595 if (nproc_fft>1) then 4596 ABI_FREE(zmpi1) 4597 end if 4598 4599 #else 4600 ABI_ERROR("FFTW3 support not activated") 4601 ABI_UNUSED((/cplexwf,ndat,n1,n2,n3,nd1,nd2,nd3proc/)) 4602 ABI_UNUSED((/max1,max2,max3,m1,m2,m3,md1,md2proc,md3,comm_fft/)) 4603 ABI_UNUSED((/zf(1,1,1,1,1),zr(1,1,1,1,1)/)) 4604 #endif 4605 4606 end subroutine fftw3_mpiforw_wf
m_fftw3/fftw3_mpifourdp_c2c [ Functions ]
[ Top ] [ m_fftw3 ] [ Functions ]
NAME
fftw3_mpifourdp_c2c
FUNCTION
Driver routine for many out-of-place 3D complex-to-complex FFTs of lengths n1, n2, n3.
INPUTS
cplex=1 if fofr is real, 2 if fofr is complex nfft=(effective) number of FFT grid points (for this processor) ndat=Number of FFTs to be done. isign=sign of Fourier transform exponent: current convention uses +1 for transforming from G to r, -1 for transforming from r to G. fftn2_distrib(n2)= rank of the processor which own fft planes in 2nd dimension for fourdp ffti2_local(n2) = local i2 indices in fourdp fftn3_distrib(n3) = rank of the processor which own fft planes in 3rd dimension for fourdp ffti3_local(n3) = local i3 indices in fourdp comm_fft=MPI communicator for the FFT [fftw_flags]=Flags used to create the plan. They can be combined with the "+" operator. fin(2*ldx*ldy*ldz*ndat)=The complex array to be transformed.
TODO
Add c2r and r2c version.
SIDE EFFECTS
Input/Output fofg(2,nfft*ndat)=f(G), complex. fofr(cplex*nfft*ndat)=input function f(r) (real or complex)
SOURCE
3795 subroutine fftw3_mpifourdp_c2c(cplex,nfft,ngfft,ndat,isign,& 3796 & fftn2_distrib,ffti2_local,fftn3_distrib,ffti3_local,fofg,fofr,comm_fft,fftw_flags) 3797 3798 !Arguments ------------------------------------ 3799 !scalars 3800 integer,intent(in) :: cplex,isign,nfft,ndat,comm_fft 3801 integer,optional,intent(in) :: fftw_flags 3802 !arrays 3803 integer,intent(in) :: ngfft(18) 3804 integer,intent(in) :: fftn2_distrib(ngfft(2)),ffti2_local(ngfft(2)) 3805 integer,intent(in) :: fftn3_distrib(ngfft(3)),ffti3_local(ngfft(3)) 3806 real(dp),intent(inout) :: fofg(2,nfft*ndat),fofr(cplex*nfft*ndat) 3807 3808 #ifdef HAVE_FFTW3_MPI 3809 !Local variables------------------------------- 3810 !scalars 3811 integer,parameter :: rank3=3 3812 integer :: n1,n2,n3,n4,n5,n6,nd2proc,nd3proc,my_flags,me_fft,nproc_fft 3813 integer(C_INTPTR_T) :: alloc_local,local_n0,local_0_start,local_n1,local_1_start 3814 type(C_PTR) :: plan,cptr_cdata 3815 !arrays 3816 integer(C_INTPTR_T) :: fft_sizes(4) 3817 complex(C_DOUBLE_COMPLEX), ABI_CONTIGUOUS pointer :: f03_cdata(:) 3818 3819 !************************************************************************* 3820 3821 my_flags=ABI_FFTW_ESTIMATE; if (PRESENT(fftw_flags)) my_flags= fftw_flags 3822 3823 n1=ngfft(1); n2=ngfft(2); n3=ngfft(3) 3824 ! No augmentation as FFTW3 does not support it 3825 n4=n1; n5=n2; n6=n3 3826 me_fft=ngfft(11); nproc_fft=ngfft(10) 3827 3828 nd2proc=((n2-1)/nproc_fft) +1 3829 nd3proc=((n6-1)/nproc_fft) +1 3830 3831 ! Get local data size and allocate (note dimension reversal, we call the C interface directly!) 3832 fft_sizes = [n3,n2,n1,ndat] 3833 3834 ! Use TRANSPOSED_OUT 3835 my_flags = ior(ABI_FFTW_ESTIMATE, ABI_FFTW_MPI_TRANSPOSED_OUT) 3836 3837 if (isign == ABI_FFTW_BACKWARD) then 3838 ! G --> R, Exchange n2 and n3 3839 fft_sizes = [n2,n3,n1,ndat] 3840 !my_flags = ior(ABI_FFTW_ESTIMATE, ABI_FFTW_MPI_TRANSPOSED_IN) 3841 end if 3842 3843 alloc_local = fftw_mpi_local_size_many_transposed(& 3844 & rank3,fft_sizes(1:3),fft_sizes(4), & 3845 & FFTW_MPI_DEFAULT_BLOCK,FFTW_MPI_DEFAULT_BLOCK, comm_fft, & 3846 & local_n0,local_0_start, & 3847 & local_n1,local_1_start) 3848 3849 ! C to F 3850 !local_0_start = local_0_start + 1 3851 !local_1_start = local_1_start + 1 3852 !write(std_out,*)"local_n0,local_0_start,alloc_local",local_n0,local_0_start,alloc_local 3853 !write(std_out,*)"local_n1,local_1_start,alloc_local",local_n1,local_1_start,alloc_local 3854 3855 ! Allocate cptr_cdata, associate to F pointer and build the plane. 3856 cptr_cdata = fftw_alloc_complex(alloc_local) 3857 3858 call c_f_pointer(cptr_cdata, f03_cdata, [alloc_local]) 3859 3860 plan = fftw_mpi_plan_many_dft(rank3,fft_sizes(1:3),fft_sizes(4), & 3861 & FFTW_MPI_DEFAULT_BLOCK,FFTW_MPI_DEFAULT_BLOCK, & 3862 & f03_cdata,f03_cdata,comm_fft,isign,my_flags) 3863 3864 select case (isign) 3865 case (ABI_FFTW_BACKWARD) 3866 ! G --> R 3867 ABI_CHECK(local_n0 == nd2proc, "local_n0 != nd2proc") 3868 3869 call mpifft_fg2dbox_dpc(nfft,ndat,fofg,n1,n2,n3,n4,nd2proc,n6,fftn2_distrib,ffti2_local,me_fft,f03_cdata) 3870 3871 ! Compute transform. 3872 call fftw_mpi_execute_dft(plan, f03_cdata, f03_cdata) 3873 3874 call mpifft_dbox2fr_dpc(n1,n2,n3,n4,n5,nd3proc,ndat,fftn3_distrib,ffti3_local,me_fft,f03_cdata,cplex,nfft,fofr) 3875 3876 case (ABI_FFTW_FORWARD) 3877 ! R --> G 3878 ABI_CHECK(local_n0 == nd3proc, "local_n0 != nd3proc") 3879 3880 call mpifft_fr2dbox_dpc(cplex,nfft,ndat,fofr,n1,n2,n3,n4,n5,nd3proc,fftn3_distrib,ffti3_local,me_fft,f03_cdata) 3881 3882 ! Compute transform. 3883 call fftw_mpi_execute_dft(plan, f03_cdata, f03_cdata) 3884 3885 ! Scale results. 3886 call mpifft_dbox2fg_dpc(n1,n2,n3,n4,nd2proc,n6,ndat,fftn2_distrib,ffti2_local,me_fft,f03_cdata,nfft,fofg) 3887 3888 case default 3889 ABI_ERROR("Wrong sign") 3890 end select 3891 3892 call fftw_destroy_plan(plan) 3893 call fftw_free(cptr_cdata) 3894 3895 #else 3896 ABI_ERROR("FFTW3_MPI support not activated") 3897 ABI_UNUSED((/cplex,nfft,ngfft(1),ndat,isign,comm_fft/)) 3898 ABI_UNUSED((/fftn2_distrib(1),ffti2_local(1)/)) 3899 ABI_UNUSED((/fftn3_distrib(1),ffti3_local(1)/)) 3900 if (PRESENT(fftw_flags)) then 3901 ABI_UNUSED(fftw_flags) 3902 end if 3903 ABI_UNUSED(fofg(1,1)) 3904 ABI_UNUSED(fofr(1)) 3905 #endif 3906 3907 end subroutine fftw3_mpifourdp_c2c
m_fftw3/fftw3_mpifourdp_c2r [ Functions ]
[ Top ] [ m_fftw3 ] [ Functions ]
NAME
fftw3_mpifourdp_c2r
FUNCTION
Driver routine for transposed out-of-place 3D complex-to-real FFT of lengths nx, ny, nz.
INPUTS
nx,ny,nz=Number of point along the three directions. ndat=Number of FFTs to be done. fofg(2,nx*ny*nz*ndat)=The complex array to be transformed. comm_fft=MPI communicator. [fftw_flags]=Flags used to create the plan. They can be combined with the "+" operator. Defaults to ABI_FFTW_ESTIMATE.
OUTPUT
fofr(2,nx*ny*nz*ndat)=The backwards real FFT of ff.
NOTES
LOCAL DATA IN FOURIER SPACE : TRANSPOSED ORDER real space --> dim = [ nx | ny | nz/np_fft] fourier space --> dim = [ nx/2 | nz | ny/np_ff ]
SOURCE
3398 subroutine fftw3_mpifourdp_c2r(nfft,ngfft,ndat,& 3399 fftn2_distrib,ffti2_local,fftn3_distrib,ffti3_local,fofg,fofr,comm_fft,fftw_flags) 3400 3401 !Arguments ------------------------------------ 3402 !scalars 3403 integer,intent(in) :: nfft,ndat,comm_fft 3404 integer,optional,intent(in) :: fftw_flags 3405 !arrays 3406 integer,intent(in) :: ngfft(18) 3407 integer,intent(in) :: fftn2_distrib(ngfft(2)),ffti2_local(ngfft(2)) 3408 integer,intent(in) :: fftn3_distrib(ngfft(3)),ffti3_local(ngfft(3)) 3409 real(dp),intent(in) :: fofg(2,nfft*ndat) 3410 real(dp),intent(out) :: fofr(nfft*ndat) 3411 3412 !Local variables------------------------------- 3413 #ifdef HAVE_FFTW3_MPI 3414 !scalars 3415 integer,parameter :: rank3=3 3416 integer :: nx,ny,nz,nproc_fft 3417 type(C_PTR) :: plan_bw, cdata_cplx,cdata_real 3418 integer(C_INTPTR_T) :: i,j,jdat,k,alloc_local,fft_sizes(4),demi_nx,base,idat,kdat 3419 integer(C_INTPTR_T) :: local_n0, local_0_start, local_n1, local_1_start 3420 !arrays 3421 complex(C_DOUBLE_COMPLEX), ABI_CONTIGUOUS pointer :: data_cplx(:,:,:) 3422 real(C_DOUBLE), ABI_CONTIGUOUS pointer :: data_real(:,:,:) 3423 3424 ! ************************************************************************* 3425 3426 !ABI_CHECK(ndat==1, "ndat > 1 not implemented yet") 3427 3428 nx=ngfft(1); ny=ngfft(2); nz=ngfft(3) 3429 nproc_fft = xmpi_comm_size(comm_fft) 3430 3431 demi_nx = nx/2 + 1 3432 fft_sizes(1)=nz 3433 fft_sizes(2)=ny 3434 fft_sizes(3)=demi_nx 3435 fft_sizes(4)=ndat 3436 3437 alloc_local = fftw_mpi_local_size_many_transposed(& 3438 & rank3,fft_sizes(1:3),fft_sizes(4), & 3439 & FFTW_MPI_DEFAULT_BLOCK,FFTW_MPI_DEFAULT_BLOCK, comm_fft, & 3440 & local_n0,local_0_start, & 3441 & local_n1,local_1_start) 3442 3443 cdata_cplx = fftw_alloc_complex(alloc_local) 3444 cdata_real = fftw_alloc_real(alloc_local*2) 3445 3446 ! OLD BY FDHAM 3447 ! dimensions are (x/2,z,y) in Fourier's Space 3448 call c_f_pointer(cdata_cplx, data_cplx, [demi_nx ,fft_sizes(1),local_n1]) 3449 ! dimensions in real space : (nx,ny,nz/nproc) 3450 call c_f_pointer(cdata_real, data_real, [2*demi_nx,fft_sizes(2),local_n0]) 3451 3452 ! dimensions are (x/2,z,y) in Fourier's Space 3453 !call c_f_pointer(cdata_cplx, data_cplx, [demi_nx ,fft_sizes(1),local_n0]) 3454 3455 !! dimensions in real space : (nx,ny,nz/nproc) 3456 !call c_f_pointer(cdata_real, data_real, [2*demi_nx,fft_sizes(2),local_n1]) 3457 3458 fft_sizes(3)=nx 3459 plan_bw = fftw_mpi_plan_many_dft_c2r(& 3460 & rank3,fft_sizes(1:3),fft_sizes(4), & 3461 & FFTW_MPI_DEFAULT_BLOCK,FFTW_MPI_DEFAULT_BLOCK, & 3462 & data_cplx, data_real , & 3463 & comm_fft,ior(ABI_FFTW_ESTIMATE,ABI_FFTW_MPI_TRANSPOSED_IN)) 3464 3465 do idat=1,ndat 3466 do k=1, nz 3467 do j=1, ny / nproc_fft 3468 jdat = j + (idat-1) * ny / nproc_fft 3469 base = nx*((j-1) + (ny/nproc_fft)*(k-1)) + (idat-1) * nfft 3470 do i=1, demi_nx 3471 data_cplx(i,k,jdat) = CMPLX(fofg(1, i + base), fofg(2, i + base), kind=C_DOUBLE_COMPLEX) 3472 end do 3473 end do 3474 end do 3475 end do 3476 3477 ! compute transform (as many times as desired) 3478 call fftw_mpi_execute_dft_c2r(plan_bw, data_cplx, data_real) 3479 3480 do idat=1,ndat 3481 do k=1,local_n0 3482 kdat = k + (idat - 1) * local_n0 3483 do j=1,ny 3484 base = nx*((j-1) + ny*(k-1)) + (idat - 1) * nfft 3485 do i=1,nx 3486 fofr(i+base) = data_real(i,j,kdat) 3487 end do 3488 end do 3489 end do 3490 end do 3491 3492 call fftw_destroy_plan(plan_bw) 3493 call fftw_free(cdata_cplx) 3494 call fftw_free(cdata_real) 3495 3496 #else 3497 ABI_ERROR("FFTW3_MPI support not activated") 3498 ABI_UNUSED((/nfft,ngfft(1),ndat,comm_fft/)) 3499 ABI_UNUSED((/fftn2_distrib(1),ffti2_local(1)/)) 3500 ABI_UNUSED((/fftn3_distrib(1),ffti3_local(1)/)) 3501 if (PRESENT(fftw_flags)) then 3502 ABI_UNUSED(fftw_flags) 3503 end if 3504 ABI_UNUSED(fofg(1,1)) 3505 ABI_UNUSED(fofr(1)) 3506 #endif 3507 3508 end subroutine fftw3_mpifourdp_c2r
m_fftw3/fftw3_mpifourdp_r2c [ Functions ]
[ Top ] [ m_fftw3 ] [ Functions ]
NAME
fftw3_mpifourdp_r2c
FUNCTION
Driver routine for out-of-place 3D real-to-complex FFT of lengths nx, ny, nz.
INPUTS
fofr(nx*ny*nz*ndat)=The real array to be transformed. ndat=Number of FFTs to be done. comm_fft=MPI communicator for the FFT. [fftw_flags]=Flags used to create the plan. They can be combined with the "+" operator. Defaults to ABI_FFTW_ESTIMATE.
OUTPUT
fofg(2,nx*ny*nz*ndat)=The forward FFT of ff.
NOTES
LOCAL DATA FOR FOURIER TRANSFORMS : TRANSPOSED ORDER AND DISTRIBUTED real space --> dim = [ nx | ny | nz/np_fft ] fourier space --> dim = [ nx | nz | ny/np_fft ] we can't take in account the symetric of the real case because after fft have been computed, the symetric data needed are dispatched over other process in parallel
SOURCE
3541 subroutine fftw3_mpifourdp_r2c(nfft,ngfft,ndat,& 3542 fftn2_distrib,ffti2_local,fftn3_distrib,ffti3_local,fofg,fofr,comm_fft,fftw_flags) 3543 3544 !Arguments ------------------------------------ 3545 !scalars 3546 integer,intent(in) :: nfft,ndat,comm_fft 3547 integer,optional,intent(in) :: fftw_flags 3548 !arrays 3549 integer,intent(in) :: ngfft(18) 3550 integer,intent(in) :: fftn2_distrib(ngfft(2)),ffti2_local(ngfft(2)) 3551 integer,intent(in) :: fftn3_distrib(ngfft(3)),ffti3_local(ngfft(3)) 3552 real(dp),intent(in) :: fofr(nfft*ndat) 3553 real(dp),intent(out) :: fofg(2,nfft*ndat) 3554 3555 !Local variables------------------------------- 3556 #ifdef HAVE_FFTW3_MPI 3557 !scalars 3558 integer,parameter :: rank3=3 3559 integer :: my_flags,nproc_fft,nx,ny,nz 3560 integer(C_INTPTR_T) :: i,j,k,base,alloc_local,i1,i2,i3,igf,idat,kdat,i2dat,padatf 3561 integer(C_INTPTR_T) :: local_n0,local_0_start,local_n1,local_1_start 3562 real(dp) :: factor_fft 3563 type(C_PTR) :: plan_fw,cdata_cplx,cdata_real 3564 !arrays 3565 complex(C_DOUBLE_COMPLEX), ABI_CONTIGUOUS pointer :: data_cplx(:,:,:),data_real(:,:,:) 3566 integer(C_INTPTR_T) :: fft_sizes(4) 3567 3568 ! ************************************************************************* 3569 3570 nproc_fft = xmpi_comm_size(comm_fft) 3571 3572 nx=ngfft(1); ny=ngfft(2); nz=ngfft(3) 3573 3574 my_flags=ABI_FFTW_ESTIMATE; if (PRESENT(fftw_flags)) my_flags= fftw_flags 3575 3576 fft_sizes(1)=nz 3577 fft_sizes(2)=ny 3578 fft_sizes(3)=nx 3579 fft_sizes(4)=ndat 3580 3581 ! Get parallel sizes 3582 alloc_local = fftw_mpi_local_size_many_transposed(& 3583 & rank3,fft_sizes(1:3),fft_sizes(4), & 3584 & FFTW_MPI_DEFAULT_BLOCK,FFTW_MPI_DEFAULT_BLOCK, comm_fft, & 3585 & local_n0,local_0_start, & 3586 & local_n1,local_1_start) 3587 3588 ! Allocate data and reference it 3589 3590 ! local data in real space --> dim = [nx | ny | nz/nproc_fft] 3591 cdata_real = fftw_alloc_complex(alloc_local) 3592 call c_f_pointer(cdata_real, data_real, [fft_sizes(3),fft_sizes(2),local_n0]) 3593 3594 ! local data in Fourier space --> dim = [nx | nz | ny/nproc_fft] 3595 cdata_cplx = fftw_alloc_complex(alloc_local) 3596 call c_f_pointer(cdata_cplx, data_cplx, [fft_sizes(3),fft_sizes(1),local_n1]) 3597 3598 ! TODO: Use true real to complex API! 3599 ! Create Plan C2C (nx,ny,nz) 3600 plan_fw = fftw_mpi_plan_many_dft(& 3601 & rank3,fft_sizes(1:3),fft_sizes(4), & 3602 & FFTW_MPI_DEFAULT_BLOCK,FFTW_MPI_DEFAULT_BLOCK, & 3603 & data_real, data_cplx , & 3604 & comm_fft,ABI_FFTW_FORWARD,ior(ABI_FFTW_ESTIMATE,ABI_FFTW_MPI_TRANSPOSED_OUT)) 3605 3606 ! Copy input data in correct format 3607 do idat=1,ndat 3608 do k=1,local_n0 3609 kdat = k + (idat-1) * local_n0 3610 do j=1, ny 3611 base = nx*((j-1) + ny*(k-1)) + (idat-1) * nfft 3612 do i=1, nx 3613 data_real(i,j,kdat) = CMPLX(fofr(i+base),zero, kind=C_DOUBLE_COMPLEX) 3614 end do 3615 end do 3616 end do 3617 end do 3618 3619 ! Compute transform 3620 call fftw_mpi_execute_dft(plan_fw, data_real, data_cplx) 3621 3622 factor_fft = one / (nx*ny*nz) 3623 3624 do idat=1,ndat 3625 padatf=(idat-1)*nfft 3626 do i3=1,nz 3627 do i2=1,ny/nproc_fft ! equivalent a local_n1 3628 i2dat = i2 + (idat-1) * ny/nproc_fft 3629 do i1=1,nx 3630 igf = i1 + nx*( (i2-1) + (i3-1)*ny/nproc_fft ) + padatf 3631 fofg(1,igf) = real(data_cplx(i1,i3,i2dat)) * factor_fft 3632 fofg(2,igf) =aimag(data_cplx(i1,i3,i2dat)) * factor_fft 3633 end do 3634 end do 3635 end do 3636 end do 3637 3638 call fftw_destroy_plan(plan_fw) 3639 call fftw_free(cdata_cplx) 3640 call fftw_free(cdata_real) 3641 3642 #else 3643 ABI_ERROR("FFTW3_MPI support not activated") 3644 ABI_UNUSED((/nfft,ngfft(1),ndat,comm_fft/)) 3645 ABI_UNUSED((/fftn2_distrib(1),ffti2_local(1)/)) 3646 ABI_UNUSED((/fftn3_distrib(1),ffti3_local(1)/)) 3647 if (PRESENT(fftw_flags)) then 3648 ABI_UNUSED(fftw_flags) 3649 end if 3650 ABI_UNUSED(fofg(1,1)) 3651 ABI_UNUSED(fofr(1)) 3652 #endif 3653 3654 end subroutine fftw3_mpifourdp_r2c
m_fftw3/fftw3_plan3_t [ Types ]
NAME
fftw3_plan3_t
FUNCTION
Structure storing the pointer to the FFTW plan as well as the options used to generate it.
SOURCE
161 type,private :: fftw3_plan3_t 162 integer :: isign=0 ! Sign of the exponential in the FFT 163 integer :: ndat=-1 ! Number of FFTs associated to the plan 164 integer :: flags=-HUGE(0) ! FFTW3 flags used to construct the plan. 165 integer(KIND_FFTW_PLAN) :: plan=NULL_PLAN ! FFTW3 plan. 166 integer :: nthreads=1 ! The number of threads associated to the plan. 167 integer :: idist=-1 168 integer :: odist=-1 169 integer :: istride=-1 170 integer :: ostride=-1 171 integer :: n(3)=-1 ! The number of FFT divisions. 172 integer :: inembed(3)=-1 173 integer :: onembed(3)=-1 174 !integer(C_INT) :: alignment(2) ! The alignment of the arrays used to construct the plan. 175 end type fftw3_plan3_t
m_fftw3/fftw3_poisson [ Functions ]
[ Top ] [ m_fftw3 ] [ Functions ]
NAME
fftw3_poisson
FUNCTION
Solve the Poisson equation in G-space given the density, n(r), in real space of the FFT box.
INPUTS
cplex=1 if fofr is real, 2 if fofr is complex nx,ny,nz=Number of FFT points along the three directions. ldx,ldy,ldz=Leading dimension of the array nr and vg. ndat = Number of densities vg(nx*ny*nz)=Potential in reciprocal space.
SIDE EFFECTS
nr(cplex*ldx*ldy*ldz*ndat) input: n(r) (real or complex) output: the hartree potential in real space
NOTES
vg is given on the FFT mesh instead of the augmented mesh [ldx,ldy,ldz] in order to simplify the interface with the other routines operating of vg
SOURCE
7250 subroutine fftw3_poisson(cplex,nx,ny,nz,ldx,ldy,ldz,ndat,vg,nr) 7251 7252 !Arguments ------------------------------------ 7253 !scalars 7254 integer,intent(in) :: cplex,nx,ny,nz,ldx,ldy,ldz,ndat 7255 !arrays 7256 real(dp),intent(inout) :: nr(cplex*ldx*ldy*ldz*ndat) 7257 real(dp),intent(in) :: vg(nx*ny*nz) 7258 7259 #ifdef HAVE_FFTW3 7260 !Local variables------------------------------- 7261 !scalars 7262 integer,parameter :: rank1=1,rank2=2 7263 integer :: ii,jj,kk,sidx,ig,ir,vgbase,ypad 7264 integer, parameter :: nthreads=1 7265 integer(KIND_FFTW_PLAN) :: bw_plan_xy,bw_plan3 7266 integer(KIND_FFTW_PLAN) :: fw_plan_xy,fw_plan3 7267 real(dp) :: fft_fact,vg_fftfact 7268 7269 ! ************************************************************************* 7270 7271 !write(std_out,*)"in poisson" 7272 ABI_CHECK(cplex==2,"cplex!=2 not coded") 7273 ABI_CHECK(ndat==1,"ndat!=1 not coded") 7274 7275 fft_fact = one/(nx*ny*nz) 7276 7277 ! The prototype for sfftw_plan_many_dft is: 7278 ! sfftw_plan_many_dft(n, howmany, 7279 ! fin, iembed, istride, idist, 7280 ! fout, oembed, ostride, odist, isign, my_flags) 7281 7282 ! 1) ldx*ldy transforms along Rz. 7283 fw_plan3 = fftw3_plan_many_dft(rank1, (/nz/), ldx*ldy, & ! We have to visit the entire augmented x-y plane! 7284 & nr, (/ldx, ldy, ldz/), ldx*ldy, 1, & 7285 & nr, (/ldx, ldy, ldz/), ldx*ldy, 1, ABI_FFTW_FORWARD, ABI_FFTW_ESTIMATE, nthreads) 7286 7287 call fftw3_execute_dft(fw_plan3, nr, nr) ! Now we have nr(x,y,Gz) 7288 call fftw3_destroy_plan(fw_plan3) 7289 7290 ! R --> G Transforms in x-y plane 7291 fw_plan_xy = fftw3_plan_many_dft(rank2, [nx,ny], 1, & 7292 & nr, (/ldx, ldy, ldz/), 1, 1, & 7293 & nr, (/ldx, ldy, ldz/), 1, 1, ABI_FFTW_FORWARD, ABI_FFTW_ESTIMATE, nthreads) 7294 7295 ! G --> R Transforms in x-y plane 7296 bw_plan_xy = fftw3_plan_many_dft(rank2, [nx, ny], 1, & 7297 & nr, (/ldx, ldy, ldz/), 1, 1, & 7298 & nr, (/ldx, ldy, ldz/), 1, 1, ABI_FFTW_BACKWARD, ABI_FFTW_ESTIMATE, nthreads) 7299 7300 ! Loop on z-planes. 7301 do kk=1,nz 7302 sidx = 1 + cplex*(kk-1)*ldx*ldy !+ cplex*(dat-1) * ldx*ldy*ldz 7303 7304 call fftw3_execute_dft(fw_plan_xy, nr(sidx:), nr(sidx:)) 7305 7306 ! At this point we have nr(Gx,Gy,Gz) on the current plane. 7307 ! Multiply by vc(Gx,Gy,Gz) and then back transform immediately to get vc(x,y,Gz) 7308 ! Note that nr is complex whereas vg is real. 7309 ! Besides, FFTW returns not normalized FTs if sign=-1 so we have to scale by fft_fact 7310 vgbase = (kk-1)*nx*ny !;vgbase = (kk-1)*ldx*ldy 7311 7312 ig = 0 7313 do jj=1,ny 7314 ypad = cplex*(jj-1)*ldx + sidx 7315 do ii=1,nx 7316 ig = ig + 1 7317 vg_fftfact = vg(vgbase+ig) * fft_fact 7318 7319 ir = cplex*(ii-1) + ypad 7320 nr(ir:ir+1) = nr(ir:ir+1) * vg_fftfact 7321 end do 7322 end do 7323 7324 call fftw3_execute_dft(bw_plan_xy, nr(sidx:), nr(sidx:)) 7325 end do 7326 7327 ! Free plans 7328 call fftw3_destroy_plan(fw_plan_xy) 7329 call fftw3_destroy_plan(bw_plan_xy) 7330 7331 ! Final transforms of vc(x,y,Gz) along Gz to get vc(x,y,z) 7332 bw_plan3 = fftw3_plan_many_dft(rank1, (/nz/), ldx*ldy, & ! We have to visit the entire augmented x-y plane! 7333 & nr, (/ldx, ldy, ldz/), ldx*ldy, 1, & 7334 & nr, (/ldx, ldy, ldz/), ldx*ldy, 1, ABI_FFTW_BACKWARD, ABI_FFTW_ESTIMATE, nthreads) 7335 7336 call fftw3_execute_dft(bw_plan3, nr, nr) 7337 call fftw3_destroy_plan(bw_plan3) 7338 7339 #else 7340 ABI_UNUSED((/cplex,nx,ny,nz,ldx,ldy,ldz,ndat/)) 7341 ABI_UNUSED((/nr(1),vg(1)/)) 7342 #endif 7343 7344 end subroutine fftw3_poisson 7345 !!** 7346 7347 !---------------------------------------------------------------------- 7348 7349 END MODULE m_fftw3
m_fftw3/fftw3_r2c_op [ Functions ]
[ Top ] [ m_fftw3 ] [ Functions ]
NAME
fftw3_r2c_op
FUNCTION
Driver routine for out-of-place 3D real-to-complex FFT of lengths nx, ny, nz.
INPUTS
nx,ny,nz=Number of points along the three directions. ldx,ldy,ldz=Physical dimensions of the f array (to avoid cache conflicts). ff(ldx*ldy*ldz*ndat)=The real array to be transformed. ndat=Number of FFTs to be done. [fftw_flags]=Flags used to create the plan. They can be combined with the "+" operator. Defaults to ABI_FFTW_ESTIMATE.
OUTPUT
gg(2,nx*ny*nz*ndat)=The forward FFT of ff.
NOTES
FIXME For the time-being. No augmentation of the mesh to reduce memory conflicts, as MKL crashes if the advanced interface is used.
SOURCE
1792 subroutine fftw3_r2c_op(nx,ny,nz,ldx,ldy,ldz,ndat,ff,gg,fftw_flags) 1793 1794 !Arguments ------------------------------------ 1795 !scalars 1796 integer,intent(in) :: nx,ny,nz,ldx,ldy,ldz,ndat 1797 integer,optional,intent(in) :: fftw_flags 1798 !arrays 1799 real(dp),intent(in) :: ff(ldx*ldy*ldz*ndat) 1800 real(dp),intent(out) :: gg(2,ldx*ldy*ldz*ndat) 1801 1802 #ifdef HAVE_FFTW3 1803 !Local variables------------------------------- 1804 !scalars 1805 integer,parameter :: rank3=3,nt_all=-1 1806 integer :: nhp,my_flags,idist,odist,padx,i1,i2,i3,igp,igf,imgf,stride 1807 integer :: i1inv,i2inv,i3inv,idat,padatf 1808 integer(KIND_FFTW_PLAN) :: my_plan 1809 !arrays 1810 integer :: inembed(rank3),onembed(rank3),n(rank3) 1811 integer,allocatable :: i1inver(:),i2inver(:),i3inver(:) 1812 real(dp),allocatable :: gg_hp(:,:) 1813 1814 ! ************************************************************************* 1815 1816 my_flags=ABI_FFTW_ESTIMATE; if (PRESENT(fftw_flags)) my_flags= fftw_flags 1817 1818 idist = ldx*ldy*ldz 1819 nhp = (nx/2+1)*ny*nz 1820 odist = nhp 1821 1822 stride = 1 1823 n = (/nx,ny,nz/) 1824 inembed= (/ldx,ldy,ldz/) 1825 onembed= (/(nx/2+1),ny,nz/) 1826 1827 ABI_MALLOC(gg_hp,(2,nhp*ndat)) 1828 1829 #ifdef DEV_RC_BUG 1830 if (ndat/=1) ABI_ERROR("ndat/=1 + MKL not coded") 1831 1832 if (ANY( n /= inembed )) then 1833 ABI_ERROR("Augmentation not supported") 1834 end if 1835 1836 call dfftw_plan_dft_r2c_3d(my_plan, nx, ny, nz, ff, gg_hp, my_flags) 1837 if (my_plan==NULL_PLAN) then 1838 ABI_ERROR("dfftw_plan_dft_r2c_3d returned NULL_PLAN") 1839 end if 1840 1841 !fftw_plan fftw_plan_many_dft_r2c(int rank3, const int *n, int howmany, 1842 ! double *in, const int *inembed, int istride, int idist, 1843 ! fftw_complex *out, const int *onembed, int ostride, int odist, unsigned flags); 1844 #else 1845 my_plan = dplan_many_dft_r2c(rank3, n, ndat, ff, inembed, stride, idist, gg_hp, onembed, stride, odist, my_flags, nt_all) 1846 #endif 1847 1848 ! Now perform the 3D FFT via FFTW. r2c are always ABI_FFTW_FORWARD 1849 call dfftw_execute_dft_r2c(my_plan, ff, gg_hp) 1850 1851 call fftw3_destroy_plan(my_plan) 1852 1853 ! FFTW returns not normalized FTs 1854 call ZDSCAL(nhp*ndat, one/(nx*ny*nz), gg_hp, 1) 1855 1856 ! Reconstruct full FFT: Hermitian redundancy: out[i] is the conjugate of out[n-i] 1857 padx = (nx/2+1) 1858 ABI_MALLOC(i1inver,(padx)) 1859 ABI_MALLOC(i2inver,(ny)) 1860 ABI_MALLOC(i3inver,(nz)) 1861 1862 i1inver(1)=1 1863 do i1=2,padx 1864 i1inver(i1)=nx+2-i1 1865 end do 1866 1867 i2inver(1)=1 1868 do i2=2,ny 1869 i2inver(i2)=ny+2-i2 1870 end do 1871 1872 i3inver(1)=1 1873 do i3=2,nz 1874 i3inver(i3)=nz+2-i3 1875 end do 1876 1877 igp=0 1878 do idat=1,ndat 1879 padatf = (idat-1)*ldx*ldy*ldz 1880 do i3=1,nz 1881 i3inv = i3inver(i3) 1882 do i2=1,ny 1883 i2inv = i2inver(i2) 1884 do i1=1,padx 1885 igp = igp+1 1886 igf = i1 + (i3-1)*ldx*ldy + (i2-1)*ldx + padatf 1887 gg(:,igf) = gg_hp(:,igp) 1888 i1inv = i1inver(i1) 1889 if (i1inv/=i1) then 1890 imgf = i1inv + (i3inv-1)*ldx*ldy + (i2inv-1)*ldx + padatf 1891 gg(1,imgf) = gg_hp(1,igp) 1892 gg(2,imgf) = -gg_hp(2,igp) 1893 end if 1894 end do 1895 end do 1896 end do 1897 end do 1898 1899 ABI_FREE(i1inver) 1900 ABI_FREE(i2inver) 1901 ABI_FREE(i3inver) 1902 ABI_FREE(gg_hp) 1903 1904 #else 1905 ABI_ERROR("FFTW3 support not activated") 1906 ABI_UNUSED((/nx,ny,nz,ldx,ldy,ldz/)) 1907 ABI_UNUSED(ff) 1908 ABI_UNUSED(gg(1,1)) 1909 if (PRESENT(fftw_flags)) then 1910 ABI_UNUSED(fftw_flags) 1911 end if 1912 #endif 1913 1914 end subroutine fftw3_r2c_op
m_fftw3/fftw3_seqfourdp [ Functions ]
[ Top ] [ m_fftw3 ] [ Functions ]
NAME
fftw3_seqfourdp
FUNCTION
Driver routine for 3D FFT of lengths nx, ny, nz. Mainly used for densities or potentials. FFT Transform is out-of-place
INPUTS
cplex=1 if fofr is real, 2 if fofr is complex nx,ny,nz=Number of point along the three directions. ldx,ldy,ldz=Leading dimension of the array. ndat = Number of FFTS isign= +1 : fofg(G) => fofr(R); -1 : fofr(R) => fofg(G) fofg(2,ldx*ldy*ldz*ndat)=The array to be transformed. [fftw_flags]=Flags used to create the plan. They can be combined with the "+" operator. Defaults to FFTW_ESTIMATE.
OUTPUT
fofr(cplex,ldx*ldy*ldz*ndat)=The FFT of fofg
SOURCE
294 subroutine fftw3_seqfourdp(cplex,nx,ny,nz,ldx,ldy,ldz,ndat,isign,fofg,fofr,fftw_flags) 295 296 !Arguments ------------------------------------ 297 !scalars 298 integer,intent(in) :: cplex,nx,ny,nz,ldx,ldy,ldz,ndat,isign 299 integer,optional,intent(in) :: fftw_flags 300 !arrays 301 real(dp),intent(inout) :: fofg(2*ldx*ldy*ldz*ndat) 302 real(dp),intent(inout) :: fofr(cplex*ldx*ldy*ldz*ndat) 303 304 !Local variables------------------------------- 305 !scalars 306 integer,parameter :: iscale1 = 1 307 integer :: my_flags,ii,jj 308 complex(spc), allocatable :: work_sp(:) 309 310 ! ************************************************************************* 311 312 my_flags = ABI_FFTW_ESTIMATE; if (PRESENT(fftw_flags)) my_flags= fftw_flags 313 314 select case (cplex) 315 case (2) 316 ! Complex to Complex. 317 if (fftcore_mixprec == 1) then 318 ! Mixed precision: copyin + in-place + copyout 319 ABI_MALLOC(work_sp, (ldx*ldy*ldz*ndat)) 320 if (isign == ABI_FFTW_BACKWARD) then ! +1 321 work_sp(:) = cmplx(fofg(1::2), fofg(2::2), kind=spc) 322 else if (isign == ABI_FFTW_FORWARD) then ! -1 323 work_sp(:) = cmplx(fofr(1::2), fofr(2::2), kind=spc) 324 else 325 ABI_BUG("Wrong isign") 326 end if 327 328 call fftw3_c2c_ip_spc(nx, ny, nz, ldx, ldy, ldz, ndat, iscale1, isign, work_sp, fftw_flags=my_flags) 329 330 if (isign == ABI_FFTW_BACKWARD) then ! +1 331 jj = 1 332 do ii=1,ldx*ldy*ldz*ndat 333 fofr(jj) = real(work_sp(ii), kind=dp) 334 fofr(jj+1) = aimag(work_sp(ii)) 335 jj = jj + 2 336 end do 337 else if (isign == ABI_FFTW_FORWARD) then ! -1 338 jj = 1 339 do ii=1,ldx*ldy*ldz*ndat 340 fofg(jj) = real(work_sp(ii), kind=dp) 341 fofg(jj+1) = aimag(work_sp(ii)) 342 jj = jj + 2 343 end do 344 end if 345 ABI_FREE(work_sp) 346 347 else 348 ! double precision version. 349 select case (isign) 350 case (ABI_FFTW_BACKWARD) ! +1 351 call fftw3_many_dft_op(nx,ny,nz,ldx,ldy,ldz,ndat,isign,fofg,fofr,fftw_flags=my_flags) 352 case (ABI_FFTW_FORWARD) ! -1 353 call fftw3_many_dft_op(nx,ny,nz,ldx,ldy,ldz,ndat,isign,fofr,fofg,fftw_flags=my_flags) 354 case default 355 ABI_BUG("Wrong isign") 356 end select 357 end if 358 359 case (1) 360 ! Real case. 361 select case (isign) 362 case (ABI_FFTW_FORWARD) 363 ! -1; R --> G 364 call fftw3_r2c_op(nx,ny,nz,ldx,ldy,ldz,ndat,fofr,fofg,fftw_flags=my_flags) 365 case (ABI_FFTW_BACKWARD) 366 ! +1; G --> R 367 call fftw3_c2r_op(nx,ny,nz,ldx,ldy,ldz,ndat,fofg,fofr,fftw_flags=my_flags) 368 case default 369 ABI_BUG("Wrong isign") 370 end select 371 372 case default 373 ABI_BUG(" Wrong value for cplex") 374 end select 375 376 end subroutine fftw3_seqfourdp
m_fftw3/fftw3_seqfourwf [ Functions ]
[ Top ] [ m_fftw3 ] [ Functions ]
NAME
fftw3_seqfourwf
FUNCTION
Carry out composite Fourier transforms between real and reciprocal (G) space. Wavefunctions, contained in a sphere in reciprocal space, can be FFT to real space. They can also be FFT from real space to a sphere. Also, the density maybe accumulated, and a local potential can be applied. The different options are : - option=0 --> reciprocal to real space and output the result. - option=1 --> reciprocal to real space and accumulate the density. - option=2 --> reciprocal to real space, apply the local potential to the wavefunction in real space and produce the result in reciprocal space. - option=3 --> real space to reciprocal space. NOTE that in this case, fftalg=1x1 MUST be used. This may be changed in the future.
INPUTS
cplex= if 1 , denpot is real, if 2 , denpot is complex (cplex=2 only allowed for option=2, and istwf_k=1) not relevant if option=0 or option=3, so cplex=0 can be used to minimize memory fofgin(2,npwin)=holds input wavefunction in G vector basis sphere. (intent(in) but the routine sphere can modify it for another iflag) gboundin(2*mgfft+8,2)=sphere boundary info for reciprocal to real space gboundout(2*mgfft+8,2)=sphere boundary info for real to reciprocal space istwf_k=option parameter that describes the storage of wfs kg_kin(3,npwin)=reduced planewave coordinates, input kg_kout(3,npwout)=reduced planewave coordinates, output mgfft=maximum size of 1D FFTs ndat=number of FFT to do in // ngfft(18)=contain all needed information about 3D FFT, see ~abinit/doc/variables/vargs.htm#ngfft npwin=number of elements in fofgin array (for option 0, 1 and 2) npwout=number of elements in fofgout array (for option 2 and 3) ldx,ldy,ldz=ngfft(4),ngfft(5),ngfft(6), dimensions of fofr. option= if 0: do direct FFT if 1: do direct FFT, then sum the density if 2: do direct FFT, multiply by the potential, then do reverse FFT if 3: do reverse FFT only weight_r=weight to be used for the accumulation of the density in real space (needed only when option=1)
OUTPUT
(see side effects)
SIDE EFFECTS
Input/Output for option==0, fofgin(2,npwin*ndat)=holds input wavefunction in G sphere; fofr(2,ldx*ldy*ldz) contains the output Fourier Transform of fofgin; no use of denpot, fofgout and npwout. for option==1, fofgin(2,npwin*ndat)=holds input wavefunction in G sphere; denpot(cplex*ldx,ldy,ldz) contains the input density at input, and the updated density at output (accumulated); no use of fofgout and npwout. for option==2, fofgin(2,npwin*ndat)=holds input wavefunction in G sphere; denpot(cplex*ldx,ldy,ldz) contains the input local potential; fofgout(2,npwout*ndat) contains the output function; for option==3, fofr(2,ldx*ldy*ldz*ndat) contains the input real space wavefunction; fofgout(2,npwout*ndat) contains its output Fourier transform; no use of fofgin and npwin.
SOURCE
444 subroutine fftw3_seqfourwf(cplex,denpot,fofgin,fofgout,fofr,gboundin,gboundout,istwf_k, & 445 kg_kin,kg_kout,mgfft,ndat,ngfft,npwin,npwout,ldx,ldy,ldz,option,weight_r,weight_i) 446 447 !Arguments ------------------------------------ 448 !scalars 449 integer,intent(in) :: cplex,istwf_k,ldx,ldy,ldz,ndat,npwin,npwout,option,mgfft 450 real(dp),intent(in) :: weight_i,weight_r 451 !arrays 452 integer,intent(in) :: gboundin(2*mgfft+8,2),gboundout(2*mgfft+8,2) 453 integer,intent(in) :: kg_kin(3,npwin),kg_kout(3,npwout),ngfft(18) 454 real(dp),intent(inout) :: denpot(cplex*ldx,ldy,ldz),fofgin(2,npwin*ndat) 455 real(dp),intent(inout) :: fofr(2,ldx*ldy*ldz*ndat) 456 real(dp),intent(out) :: fofgout(2,npwout*ndat) 457 458 !Local variables------------------------------- 459 !scalars 460 integer,parameter :: me_g0=1,ndat1=1 461 integer :: nx,ny,nz,fftalg,fftalga,fftalgc,fftcache,dat,ptg,ptr,ptgin,ptgout,nthreads 462 character(len=500) :: msg 463 logical :: use_fftrisc 464 !arrays 465 !real(dp),allocatable :: saveden(:,:,:) 466 #if 0 467 logical :: use_fftbox 468 integer,parameter :: shiftg(3)=(/0,0,0/) 469 integer :: symm(3,3) 470 #endif 471 472 ! ************************************************************************* 473 474 if (all(option /= [0, 1, 2, 3])) then 475 write(msg,'(a,i0,a)')' Option:',option,' is not allowed. Only option=0, 1, 2 or 3 are allowed presently.' 476 ABI_ERROR(msg) 477 end if 478 479 if (option == 1 .and. cplex /= 1) then 480 ABI_ERROR(sjoin("With option number 1, cplex must be 1 but it is cplex:", itoa(cplex))) 481 end if 482 483 if (option==2 .and. (cplex/=1 .and. cplex/=2)) then 484 ABI_ERROR(sjoin("With the option number 2, cplex must be 1 or 2, but it is cplex:", itoa(cplex))) 485 end if 486 487 nx=ngfft(1); ny=ngfft(2); nz=ngfft(3) 488 fftalg=ngfft(7); fftalga=fftalg/100; fftalgc=mod(fftalg,10) 489 fftcache=ngfft(8) 490 491 use_fftrisc = (fftalgc==2) 492 if (istwf_k==2.and.option==3) use_fftrisc = .FALSE. 493 if (istwf_k>2.and.ANY(option==(/0,3/))) use_fftrisc = .FALSE. 494 495 nthreads = xomp_get_num_threads(open_parallel=.TRUE.) 496 497 if (use_fftrisc) then 498 !call wrtout(std_out, calling fftw3_fftrisc") 499 500 if (ndat == 1) then 501 if (fftcore_mixprec == 0) then 502 call fftw3_fftrisc_dp(cplex,denpot,fofgin,fofgout,fofr,gboundin,gboundout,istwf_k,kg_kin,kg_kout,& 503 mgfft,ngfft,npwin,npwout,ldx,ldy,ldz,option,weight_r,weight_i) 504 else 505 call fftw3_fftrisc_mixprec(cplex,denpot,fofgin,fofgout,fofr,gboundin,gboundout,istwf_k,kg_kin,kg_kout,& 506 mgfft,ngfft,npwin,npwout,ldx,ldy,ldz,option,weight_r,weight_i) 507 end if 508 509 else 510 ! All this boilerplate code is needed because the caller might pass zero-sized arrays 511 ! for the arguments that are not referenced and we don't want to have problems at run-time. 512 ! Moreover option 1 requires a special treatment when threads are started at this level. 513 514 SELECT CASE (option) 515 CASE (0) 516 ! 517 ! fofgin -> fofr, no use of denpot, fofgout and npwout. 518 if (.not.fftw3_spawn_threads_here(ndat,nthreads)) then 519 do dat=1,ndat 520 ptg = 1 + (dat-1)*npwin 521 ptr = 1 + (dat-1)*ldx*ldy*ldz 522 call fftw3_fftrisc_dp(cplex,denpot,fofgin(1,ptg),fofgout,fofr(1,ptr),gboundin,gboundout,istwf_k,kg_kin,kg_kout,& 523 & mgfft,ngfft,npwin,npwout,ldx,ldy,ldz,option,weight_r,weight_i) 524 end do 525 else 526 !$OMP PARALLEL DO PRIVATE(ptg,ptr) 527 do dat=1,ndat 528 ptg = 1 + (dat-1)*npwin 529 ptr = 1 + (dat-1)*ldx*ldy*ldz 530 call fftw3_fftrisc_dp(cplex,denpot,fofgin(1,ptg),fofgout,fofr(1,ptr),gboundin,gboundout,istwf_k,kg_kin,kg_kout,& 531 & mgfft,ngfft,npwin,npwout,ldx,ldy,ldz,option,weight_r,weight_i) 532 end do 533 end if 534 535 CASE (1) 536 !fofgin -> local ur and accumulate density in denpot 537 ! TODO this is delicate part to do in parallel, as one should OMP reduce denpot. 538 ! but this causes problems with the stack. 539 540 do dat=1,ndat 541 ptg = 1 + (dat-1)*npwin 542 ptr = 1 + (dat-1)*ldx*ldy*ldz 543 call fftw3_fftrisc_dp(cplex,denpot,fofgin(1,ptg),fofgout,fofr,gboundin,gboundout,istwf_k,kg_kin,kg_kout,& 544 & mgfft,ngfft,npwin,npwout,ldx,ldy,ldz,option,weight_r,weight_i) 545 end do 546 547 ! This version doesn't seem efficient 548 !!! !$OMP PARALLEL PRIVATE(ptg,ptr,saveden) 549 !!! ABI_MALLOC(saveden, (ldx,ldy,ldz)) 550 !!! saveden = zero 551 !!! !$OMP DO 552 !!! do dat=1,ndat 553 !!! ptg = 1 + (dat-1)*npwin 554 !!! ptr = 1 + (dat-1)*ldx*ldy*ldz 555 !!! call fftw3_fftrisc_dp(cplex,saveden,fofgin(1,ptg),fofgout,fofr,gboundin,gboundout,istwf_k,kg_kin,kg_kout,& 556 !!! & mgfft,ngfft,npwin,npwout,ldx,ldy,ldz,option,weight_r) 557 !!! end do 558 !!! !$OMP END DO NOWAIT 559 !!! !$OMP CRITICAL (OMPC_addrho) 560 !!! denpot = denpot + saveden 561 !!! !$OMP END CRITICAL (OMPC_addrho) 562 !!! ABI_FREE(saveden) 563 !!! !$OMP END PARALLEL 564 565 CASE (2) 566 ! <G|vloc(r)|fofgin(r)> in fofgout 567 if (.not.fftw3_spawn_threads_here(ndat,nthreads)) then 568 do dat=1,ndat 569 ptgin = 1 + (dat-1)*npwin 570 ptgout = 1 + (dat-1)*npwout 571 if (fftcore_mixprec == 0) then 572 call fftw3_fftrisc_dp(cplex,denpot,fofgin(1,ptgin),fofgout(1,ptgout),fofr,gboundin,gboundout,& 573 istwf_k,kg_kin,kg_kout,mgfft,ngfft,npwin,npwout,ldx,ldy,ldz,option,weight_r,weight_i) 574 else 575 call fftw3_fftrisc_mixprec(cplex,denpot,fofgin(1,ptgin),fofgout(1,ptgout),fofr,gboundin,gboundout,& 576 istwf_k,kg_kin,kg_kout,mgfft,ngfft,npwin,npwout,ldx,ldy,ldz,option,weight_r,weight_i) 577 end if 578 end do 579 else 580 !$OMP PARALLEL DO PRIVATE(ptgin,ptgout) 581 do dat=1,ndat 582 ptgin = 1 + (dat-1)*npwin 583 ptgout = 1 + (dat-1)*npwout 584 call fftw3_fftrisc_dp(cplex,denpot,fofgin(1,ptgin),fofgout(1,ptgout),fofr,gboundin,gboundout,istwf_k,kg_kin,kg_kout,& 585 & mgfft,ngfft,npwin,npwout,ldx,ldy,ldz,option,weight_r,weight_i) 586 end do 587 end if 588 589 CASE (3) 590 ! fofr -> fofgout 591 if (.not.fftw3_spawn_threads_here(ndat,nthreads)) then 592 do dat=1,ndat 593 ptr = 1 + (dat-1)*ldx*ldy*ldz 594 ptgout = 1 + (dat-1)*npwout 595 call fftw3_fftrisc_dp(cplex,denpot,fofgin,fofgout(1,ptgout),fofr(1,ptr),gboundin,gboundout,istwf_k,kg_kin,kg_kout,& 596 & mgfft,ngfft,npwin,npwout,ldx,ldy,ldz,option,weight_r,weight_i) 597 end do 598 else 599 !$OMP PARALLEL DO PRIVATE(ptr,ptgout) 600 do dat=1,ndat 601 ptr = 1 + (dat-1)*ldx*ldy*ldz 602 ptgout = 1 + (dat-1)*npwout 603 call fftw3_fftrisc_dp(cplex,denpot,fofgin,fofgout(1,ptgout),fofr(1,ptr),gboundin,gboundout,istwf_k,kg_kin,kg_kout,& 604 & mgfft,ngfft,npwin,npwout,ldx,ldy,ldz,option,weight_r,weight_i) 605 end do 606 end if 607 608 CASE DEFAULT 609 write(msg,'(a,i0,a)')'Option',option,' is not allowed. Only option=0, 1, 2 or 3 are allowed presently.' 610 ABI_ERROR(msg) 611 END SELECT 612 613 end if 614 615 else 616 617 #if 1 618 SELECT CASE (option) 619 CASE (0) 620 ! 621 ! FFT u(g) --> u(r) 622 if (.not.fftw3_spawn_threads_here(ndat,nthreads)) then 623 call fftw3_fftug_dp(fftalg,fftcache,npwin,nx,ny,nz,ldx,ldy,ldz,ndat,istwf_k,mgfft,kg_kin,gboundin,fofgin,fofr) 624 else 625 !$OMP PARALLEL DO PRIVATE(ptg, ptr) 626 do dat=1,ndat 627 ptg = 1 + (dat-1)*npwin 628 ptr = 1 + (dat-1)*ldx*ldy*ldz 629 call fftw3_fftug_dp(fftalg,fftcache,npwin,nx,ny,nz,ldx,ldy,ldz,ndat1,& 630 & istwf_k,mgfft,kg_kin,gboundin,fofgin(1,ptg),fofr(1,ptr)) 631 end do 632 end if 633 634 CASE (1) 635 ! TODO this is delicate part to do in parallel, as one should OMP reduce denpot. 636 call fftw3_fftug_dp(fftalg,fftcache,npwin,nx,ny,nz,ldx,ldy,ldz,ndat,istwf_k,mgfft,kg_kin,gboundin,fofgin,fofr) 637 call cg_addtorho(nx,ny,nz,ldx,ldy,ldz,ndat,weight_r,weight_i,fofr,denpot) 638 639 CASE (2) 640 641 if (.not.fftw3_spawn_threads_here(ndat,nthreads)) then 642 call fftw3_fftug_dp(fftalg,fftcache,npwin,nx,ny,nz,ldx,ldy,ldz,ndat,istwf_k,mgfft,kg_kin,gboundin,fofgin,fofr) 643 call cg_vlocpsi(nx,ny,nz,ldx,ldy,ldz,ndat,cplex,denpot,fofr) 644 645 ! The data for option==2 is now in fofr. 646 call fftw3_fftpad_dp(fofr,nx,ny,nz,ldx,ldy,ldz,ndat,mgfft,-1,gboundout) 647 648 call cg_box2gsph(nx,ny,nz,ldx,ldy,ldz,ndat,npwout,kg_kout,fofr,fofgout) 649 else 650 651 !$OMP PARALLEL DO PRIVATE(ptg, ptr) 652 do dat=1,ndat 653 ptg = 1 + (dat-1)*npwin 654 ptr = 1 + (dat-1)*ldx*ldy*ldz 655 call fftw3_fftug_dp(fftalg,fftcache,npwin,nx,ny,nz,ldx,ldy,ldz,ndat1,& 656 & istwf_k,mgfft,kg_kin,gboundin,fofgin(1,ptg),fofr(1,ptr)) 657 658 call cg_vlocpsi(nx,ny,nz,ldx,ldy,ldz,ndat1,cplex,denpot,fofr(1,ptr)) 659 660 ! The data for option==2 is now in fofr. 661 call fftw3_fftpad_dp(fofr(1,ptr),nx,ny,nz,ldx,ldy,ldz,ndat1,mgfft,-1,gboundout) 662 663 ptg = 1 + (dat-1)*npwout 664 call cg_box2gsph(nx,ny,nz,ldx,ldy,ldz,ndat1,npwout,kg_kout,fofr(1,ptr),fofgout(1,ptg)) 665 end do 666 end if 667 668 CASE (3) 669 ! The data for option==3 is already in fofr. 670 if (.not.fftw3_spawn_threads_here(ndat,nthreads)) then 671 call fftw3_fftpad_dp(fofr,nx,ny,nz,ldx,ldy,ldz,ndat,mgfft,-1,gboundout) 672 call cg_box2gsph(nx,ny,nz,ldx,ldy,ldz,ndat,npwout,kg_kout,fofr,fofgout) 673 else 674 !$OMP PARALLEL DO PRIVATE(ptg, ptr) 675 do dat=1,ndat 676 ptg = 1 + (dat-1)*npwout 677 ptr = 1 + (dat-1)*ldx*ldy*ldz 678 call fftw3_fftpad_dp(fofr(1,ptr),nx,ny,nz,ldx,ldy,ldz,ndat1,mgfft,-1,gboundout) 679 call cg_box2gsph(nx,ny,nz,ldx,ldy,ldz,ndat1,npwout,kg_kout,fofr(1,ptr),fofgout(1,ptg)) 680 end do 681 end if 682 683 CASE DEFAULT 684 write(msg,'(a,i0,a)')'Option',option,' is not allowed. Only option=0, 1, 2 or 3 are allowed presently.' 685 ABI_ERROR(msg) 686 END SELECT 687 688 689 #else 690 symm=0; symm(1,1)=1; symm(2,2)=1; symm(3,3)=1 691 use_fftbox = .FALSE. 692 #ifdef HAVE_OPENMP 693 use_fftbox = (ndat>1) 694 #endif 695 !use_fftbox = .TRUE. 696 697 SELECT CASE (option) 698 CASE (0) 699 ! 700 ! FFT u(g) --> u(r) 701 call sphere(fofgin,ndat,npwin,fofr,nx,ny,nz,ldx,ldy,ldz,kg_kin,istwf_k,1,me_g0,shiftg,symm,one) 702 703 if (use_fftbox) then 704 call fftw3_many_dft_ip(nx,ny,nz,ldx,ldy,ldz,ndat,ABI_FFTW_BACKWARD,fofr) 705 else 706 call fftw3_fftpad_dp(fofr,nx,ny,nz,ldx,ldy,ldz,ndat,mgfft,ABI_FFTW_BACKWARD,gboundin) 707 end if 708 709 CASE (1) 710 ! TODO this is delicate part to do in parallel, as one should OMP reduce denpot. 711 712 call sphere(fofgin,ndat,npwin,fofr,nx,ny,nz,ldx,ldy,ldz,kg_kin,istwf_k,1,me_g0,shiftg,symm,one) 713 714 if (use_fftbox) then 715 call fftw3_many_dft_ip(nx,ny,nz,ldx,ldy,ldz,ndat,ABI_FFTW_BACKWARD,fofr) 716 else 717 call fftw3_fftpad_dp(fofr,nx,ny,nz,ldx,ldy,ldz,ndat,mgfft,ABI_FFTW_BACKWARD,gboundin) 718 end if 719 720 call cg_addtorho(nx,ny,nz,ldx,ldy,ldz,ndat,weight_r,weight_i,fofr,denpot) 721 722 CASE (2) 723 724 call sphere(fofgin,ndat,npwin,fofr,nx,ny,nz,ldx,ldy,ldz,kg_kin,istwf_k,1,me_g0,shiftg,symm,one) 725 726 if (use_fftbox) then 727 call fftw3_many_dft_ip(nx,ny,nz,ldx,ldy,ldz,ndat,ABI_FFTW_BACKWARD,fofr) 728 else 729 call fftw3_fftpad_dp(fofr,nx,ny,nz,ldx,ldy,ldz,ndat,mgfft,ABI_FFTW_BACKWARD,gboundin) 730 end if 731 732 call cg_vlocpsi(nx,ny,nz,ldx,ldy,ldz,ndat,cplex,denpot,fofr) 733 734 ! The data for option==2 is now in fofr. 735 if (use_fftbox) then 736 call fftw3_many_dft_ip(nx,ny,nz,ldx,ldy,ldz,ndat,ABI_FFTW_FORWARD,fofr) 737 else 738 call fftw3_fftpad_dp(fofr,nx,ny,nz,ldx,ldy,ldz,ndat,mgfft,ABI_FFTW_FORWARD,gboundout) 739 end if 740 741 call cg_box2gsph(nx,ny,nz,ldx,ldy,ldz,ndat,npwout,kg_kout,fofr,fofgout) 742 743 CASE (3) 744 ! The data for option==3 is already in fofr. 745 call fftw3_fftpad_dp(fofr,nx,ny,nz,ldx,ldy,ldz,ndat,mgfft,ABI_FFTW_FORWARD,gboundout) 746 747 call cg_box2gsph(nx,ny,nz,ldx,ldy,ldz,ndat,npwout,kg_kout,fofr,fofgout) 748 749 CASE DEFAULT 750 write(msg,'(a,i0,a)')'Option',option,' is not allowed. Only option=0, 1, 2 or 3 are allowed presently.' 751 ABI_ERROR(msg) 752 END SELECT 753 #endif 754 end if 755 756 end subroutine fftw3_seqfourwf
m_fftw3/fftw3_set_nthreads [ Functions ]
[ Top ] [ m_fftw3 ] [ Functions ]
NAME
fftw3_set_nthreads
FUNCTION
This function sets the number of threads you want FFTW3 to use (or actually, the maximum number). It also performs any one-time initialization required to use FFTW3 threads. All plans subsequently created with any planner routine will use nthreads threads. If you pass an nthreads argument of 1 (the default), threads are disabled for subsequent plans. It does nothing if HAVE_FFT_FFTW3_THREADS is not defined.
INPUTS
[nthreads]=The number of threads you want FFTW3 to use. Default xomp_get_max_threads()
SOURCE
2330 subroutine fftw3_set_nthreads(nthreads) 2331 2332 !Arguments ------------------------------------ 2333 integer,optional,intent(in) :: nthreads 2334 2335 !Local variables ------------------------------ 2336 !scalars 2337 #ifdef HAVE_FFTW3_THREADS 2338 integer :: istat,nt 2339 integer,parameter :: enough=1 2340 integer,save :: nwarns=0 2341 #endif 2342 2343 ! ************************************************************************* 2344 2345 #ifdef HAVE_FFTW3_THREADS 2346 if (THREADS_INITED==0) then 2347 ABI_WARNING("Threads are not initialized") 2348 end if 2349 2350 if (PRESENT(nthreads)) then 2351 if (nthreads<=0) then 2352 nt = xomp_get_max_threads() 2353 else 2354 nt = nthreads 2355 end if 2356 else 2357 nt = xomp_get_max_threads() 2358 end if 2359 2360 call dfftw_plan_with_nthreads(nt) 2361 2362 #ifndef HAVE_OPENMP 2363 if (nwarns <= enough) then 2364 nwarns = nwarns + 1 2365 ABI_WARNING("Using FFTW3 with threads but HAVE_OPENMP is not defined!") 2366 end if 2367 #endif 2368 2369 #else 2370 if (PRESENT(nthreads)) then 2371 ABI_UNUSED(nthreads) 2372 end if 2373 #endif 2374 2375 end subroutine fftw3_set_nthreads
m_fftw3/fftw3_spawn_threads_here [ Functions ]
[ Top ] [ m_fftw3 ] [ Functions ]
NAME
fftw3_spawn_threads_here
FUNCTION
Helper function that returns true if FFT calls should be OMP parallelized in the client code.
INPUTS
ndat=Number of FFT transforms to do nthreads = Number of threads available
SOURCE
3039 function fftw3_spawn_threads_here(ndat,nthreads) result(ans) 3040 3041 !Arguments ------------------------------------ 3042 !scalars 3043 integer,intent(in) :: ndat,nthreads 3044 logical :: ans 3045 3046 ! ************************************************************************* 3047 3048 ans = .FALSE. 3049 #ifdef HAVE_OPENMP 3050 ans = (nthreads > 1 .and. MOD(ndat,nthreads) == 0 .and. .not. USE_LIB_THREADS) 3051 #else 3052 ABI_UNUSED((/ndat,nthreads/)) 3053 #endif 3054 3055 end function fftw3_spawn_threads_here
m_fftw3/fftw3_use_lib_threads [ Functions ]
[ Top ] [ m_fftw3 ] [ Functions ]
NAME
fftw3_use_lib_threads
FUNCTION
INPUTS
SOURCE
3070 subroutine fftw3_use_lib_threads(logvar) 3071 3072 !Arguments ------------------------------------ 3073 !scalars 3074 logical,intent(in) :: logvar 3075 3076 ! ************************************************************************* 3077 3078 USE_LIB_THREADS = logvar 3079 3080 end subroutine fftw3_use_lib_threads
m_fftw3/fftw3mpi_many_dft_ip [ Functions ]
[ Top ] [ m_fftw3 ] [ Functions ]
NAME
fftw3mpi_many_dft_ip
FUNCTION
Driver routine for many out-of-place 3D complex-to-complex FFTs of lengths nx, ny, nz.
INPUTS
nx,ny,nz=Number of points along the three directions. ldx,ldy,ldz=Physical dimension of the fin and fout arrays (to avoid cache conflicts). ndat=Number of FFTs to be done. fin(2*ldx*ldy*ldz*ndat)=The complex array to be transformed. isign=sign of Fourier transform exponent: current convention uses +1 for transforming from G to r, -1 for transforming from r to G. comm_fft=MPI communicator for the FFT [fftw_flags]=Flags used to create the plan. They can be combined with the "+" operator. Defaults to ABI_FFTW_ESTIMATE.
TODO
Remove me
OUTPUT
fout(2,ldx*ldy*ldz*ndat)=The Fourier transform of fin.
SOURCE
3211 subroutine fftw3mpi_many_dft_ip(nx,ny,nz,ldx,ldy,ldz,ndat,isign,fin,fout,comm_fft,fftw_flags) 3212 3213 !Arguments ------------------------------------ 3214 !scalars 3215 integer,intent(in) :: nx,ny,nz,ldx,ldy,ldz,ndat,isign,comm_fft 3216 integer,optional,intent(in) :: fftw_flags 3217 !arrays 3218 real(dp),intent(in) :: fin(2,ldx,ldy,ldz*ndat) 3219 real(dp),intent(out) :: fout(2,ldx,ldy,ldz*ndat) 3220 3221 #ifdef HAVE_FFTW3_MPI 3222 !Local variables------------------------------- 3223 !scalars 3224 integer,parameter :: rank3=3 3225 integer :: my_flags 3226 real(dp):: factor_fft 3227 !arrays 3228 type(C_PTR) :: plan, cdata 3229 complex(C_DOUBLE_COMPLEX), ABI_CONTIGUOUS pointer :: data(:,:,:) 3230 integer(C_INTPTR_T) :: i, j, k, alloc_local, local_n0, local_0_start,fft_sizes(4) 3231 3232 !************************************************************************* 3233 3234 my_flags=ABI_FFTW_ESTIMATE; if (PRESENT(fftw_flags)) my_flags= fftw_flags 3235 3236 ! get local data size and allocate (note dimension reversal) 3237 fft_sizes = [nz,ny,nx,ndat] 3238 3239 alloc_local = fftw_mpi_local_size_many( & 3240 & rank3,fft_sizes(1:3),fft_sizes(4),& 3241 & FFTW_MPI_DEFAULT_BLOCK, comm_fft, & 3242 & local_n0,local_0_start) 3243 3244 ! Allocate cdata, build the plane and copy data: fin --> data 3245 cdata = fftw_alloc_complex(alloc_local) 3246 call c_f_pointer(cdata, data, [fft_sizes(3),fft_sizes(2), local_n0]) 3247 3248 plan = fftw_mpi_plan_many_dft(rank3,fft_sizes(1:3),fft_sizes(4), & 3249 & FFTW_MPI_DEFAULT_BLOCK,FFTW_MPI_DEFAULT_BLOCK, & 3250 & data,data,comm_fft,isign,my_flags) 3251 3252 do k=1, local_n0*ndat 3253 do j=1, ny 3254 do i=1, nx 3255 data(i,j,k) = CMPLX( fin(1,i,j,k),fin(2,i,j,k),C_DOUBLE_COMPLEX) 3256 end do 3257 end do 3258 end do 3259 3260 ! Compute transform. 3261 call fftw_mpi_execute_dft(plan, data, data) 3262 3263 if(isign==ABI_FFTW_FORWARD) then 3264 ! Scale results. 3265 factor_fft = one / (nx*ny*nz) 3266 do k=1, local_n0*ndat 3267 do j=1, ny 3268 do i=1, nx 3269 fout(1,i,j,k) = real(data(i,j,k)) * factor_fft 3270 fout(2,i,j,k) = aimag(data(i,j,k)) * factor_fft 3271 end do 3272 end do 3273 end do 3274 end if 3275 3276 call fftw_destroy_plan(plan) 3277 call fftw_free(cdata) 3278 3279 #else 3280 ABI_ERROR("FFTW3_MPI support not activated") 3281 ABI_UNUSED((/nx,ny,nz,ldx,ldy,ldz,ndat,isign/)) 3282 ABI_UNUSED(comm_fft) 3283 if (PRESENT(fftw_flags)) then 3284 ABI_UNUSED(fftw_flags) 3285 end if 3286 ABI_UNUSED(fin(1,1,1,1)) 3287 ABI_UNUSED(fout(1,1,1,1)) 3288 #endif 3289 3290 end subroutine fftw3mpi_many_dft_ip
m_fftw3/fftw3mpi_many_dft_tr [ Functions ]
[ Top ] [ m_fftw3 ] [ Functions ]
NAME
fftw3mpi_many_dft_tr
FUNCTION
Driver routine for many out-of-place 3D complex-to-complex FFTs of lengths nx, ny, nz.
INPUTS
nx,ny,nz=Number of points along the three directions. ldx,ldy,ldz=Physical dimension of the fin and fout arrays (to avoid cache conflicts). ndat=Number of FFTs to be done. fin(2*ldx*ldy*ldz*ndat)=The complex array to be transformed. isign=sign of Fourier transform exponent: current convention uses +1 for transforming from G to r, -1 for transforming from r to G. comm_fft=MPI communicator for the FFT. [fftw_flags]=Flags used to create the plan. They can be combined with the "+" operator. Defaults to ABI_FFTW_ESTIMATE.
TODO
Remove me
OUTPUT
fout(2,ldx*ldy*ldz*ndat)=The Fourier transform of fin.
SOURCE
3322 subroutine fftw3mpi_many_dft_tr(nx,ny,nz,ndat,isign,fin,fout,comm_fft,fftw_flags) 3323 3324 !Arguments ------------------------------------ 3325 !scalars 3326 integer,intent(in) :: nx,ny,nz,ndat,isign,comm_fft 3327 integer,optional,intent(in) :: fftw_flags 3328 !arrays 3329 complex(C_DOUBLE_COMPLEX),ABI_CONTIGUOUS pointer :: fin(:,:,:) 3330 complex(C_DOUBLE_COMPLEX),ABI_CONTIGUOUS pointer :: fout(:,:,:) 3331 3332 !Local variables------------------------------- 3333 #ifdef HAVE_FFTW3_MPI 3334 !scalars 3335 integer :: my_flags 3336 !FFTWMPI stuff 3337 type(C_PTR) :: plan 3338 integer(C_INTPTR_T) :: fft_sizes(4) 3339 3340 !************************************************************************* 3341 3342 my_flags=ABI_FFTW_ESTIMATE; if (PRESENT(fftw_flags)) my_flags= fftw_flags 3343 my_flags = ior(my_flags,FFTW_DESTROY_INPUT) 3344 3345 fft_sizes(1)=nz 3346 fft_sizes(2)=ny 3347 fft_sizes(3)=nx 3348 fft_sizes(4)=ndat 3349 3350 plan = fftw_mpi_plan_many_dft(3,fft_sizes(1:3),fft_sizes(4), & 3351 & FFTW_MPI_DEFAULT_BLOCK,FFTW_MPI_DEFAULT_BLOCK, & 3352 & fin,fout,comm_fft,isign,my_flags) 3353 3354 !Compute transform (as many times as desired) 3355 call fftw_mpi_execute_dft(plan, fin, fout) 3356 call fftw_destroy_plan(plan) 3357 3358 #else 3359 ABI_ERROR("FFTW3_MPI support not activated") 3360 ABI_UNUSED((/nx,ny,nz,ndat,isign,comm_fft/)) 3361 if (PRESENT(fftw_flags)) then 3362 ABI_UNUSED(fftw_flags) 3363 end if 3364 ABI_UNUSED(fin(1,1,1)) 3365 ABI_UNUSED(fout(1,1,1)) 3366 #endif 3367 3368 end subroutine fftw3mpi_many_dft_tr
m_fftw3/fftwmpi_free_work_array [ Functions ]
[ Top ] [ m_fftw3 ] [ Functions ]
NAME
fftwmpi_free_work_array
FUNCTION
routine for freeing fftw work arrray
INPUTS
OUTPUT
cdata_f,cdata_r: C pointers to free for fourier andreal data
SOURCE
3161 subroutine fftwmpi_free_work_array(cdata_f,cdata_r) 3162 3163 !Arguments ------------------------------------ 3164 !scalars 3165 type(C_PTR), intent(inout) :: cdata_f,cdata_r 3166 3167 ! ************************************************************************* 3168 3169 #ifdef HAVE_FFTW3_MPI 3170 call fftw_free(cdata_r) 3171 call fftw_free(cdata_f) 3172 #else 3173 ABI_ERROR("FFTW3_MPI support not activated") 3174 if(.false.) then 3175 cdata_r = C_NULL_PTR; cdata_f = C_NULL_PTR 3176 end if 3177 #endif 3178 3179 end subroutine fftwmpi_free_work_array
m_fftw3/fftwmpi_get_work_array [ Functions ]
[ Top ] [ m_fftw3 ] [ Functions ]
NAME
fftwmpi_get_work_array
FUNCTION
Driver routine for allocate fftw work arrray for 3D complex-to-complex FFTs of lengths nx, ny, nz.
INPUTS
nx,ny,nz=Number of points along the three directions. ndat=Number of FFTs to be done. comm_fft=MPI communicator.
OUTPUT
cdata_f,cdata_r: C pointers to use for fourier andreal data n0,n0_tr : local size on the shared dimension (nz or ny if transposed mode is used) offset,offset_tr : offset per process in continuous tabx
SOURCE
3104 subroutine fftwmpi_get_work_array(cdata_f,cdata_r,rank,nx,ny,nz,ndat,comm_fft,n0,offset,n0_tr,offset_tr) 3105 3106 !Arguments ------------------------------------ 3107 !scalars 3108 integer,intent(in) :: nx,ny,nz,ndat,rank,comm_fft 3109 integer(C_INTPTR_T), intent(out) :: n0, offset, n0_tr, offset_tr 3110 type(C_PTR), intent(out) :: cdata_f,cdata_r 3111 3112 !Local variables------------------------------- 3113 #ifdef HAVE_FFTW3_MPI 3114 !scalars 3115 integer(C_INTPTR_T) :: alloc_local 3116 !arrays 3117 integer(C_INTPTR_T) :: fft_sizes(4) 3118 3119 ! ************************************************************************* 3120 3121 ! Dimensions are inverted here (C interface). 3122 fft_sizes(1)=nz 3123 fft_sizes(2)=ny 3124 fft_sizes(3)=nx 3125 fft_sizes(4)=ndat 3126 3127 alloc_local = fftw_mpi_local_size_many_transposed(rank,fft_sizes(1:3),fft_sizes(4), & 3128 & FFTW_MPI_DEFAULT_BLOCK,FFTW_MPI_DEFAULT_BLOCK, comm_fft, & 3129 & n0,offset, & 3130 & n0_tr,offset_tr) 3131 3132 cdata_f = fftw_alloc_complex(alloc_local) 3133 cdata_r = fftw_alloc_complex(alloc_local) 3134 3135 #else 3136 ABI_ERROR("FFTW3_MPI support not activated") 3137 ABI_UNUSED((/nx,ny,nz,ndat,rank,comm_fft/)) 3138 cdata_f = C_NULL_PTR; cdata_r = C_NULL_PTR 3139 n0 = 0; offset = 0; n0_tr = 0; offset_tr = 0 3140 #endif 3141 3142 end subroutine fftwmpi_get_work_array
m_fftw3/old_fftw3_mpifourdp [ Functions ]
[ Top ] [ m_fftw3 ] [ Functions ]
NAME
old_fftw3_mpifourdp
FUNCTION
INPUTS
cplex=1 if fofr is real, 2 if fofr is complex nfft=(effective) number of FFT grid points (for this processor) ndat=Number of FFTs to be done. isign= +1 : fofg(G) => fofr(R); -1 : fofr(R) => fofg(G) fftn2_distrib(n2)= rank of the processor which own fft planes in 2nd dimension for fourdp ffti2_local(n2) = local i2 indices in fourdp fftn3_distrib(n3) = rank of the processor which own fft planes in 3rd dimension for fourdp ffti3_local(n3) = local i3 indices in fourdp comm_fft=MPI communicator for the FFT [fftw_flags]=Flags used to create the plan. They can be combined with the "+" operator.
SIDE EFFECTS
Input/Output fofg(2,nfft*ndat)=f(G), complex. fofr(cplex*nfft*ndat)=input function f(r) (real or complex)
SOURCE
3685 subroutine old_fftw3_mpifourdp(cplex,nfft,ngfft,ndat,isign,& 3686 fftn2_distrib,ffti2_local,fftn3_distrib,ffti3_local,fofg,fofr,comm_fft,fftw_flags) 3687 3688 !Arguments ------------------------------------ 3689 !scalars 3690 integer,intent(in) :: cplex,nfft,ndat,isign,comm_fft 3691 integer,optional,intent(in) :: fftw_flags 3692 !arrays 3693 integer,intent(in) :: ngfft(18) 3694 integer,intent(in) :: fftn2_distrib(ngfft(2)),ffti2_local(ngfft(2)) 3695 integer,intent(in) :: fftn3_distrib(ngfft(3)),ffti3_local(ngfft(3)) 3696 real(dp),intent(inout) :: fofg(2,nfft*ndat),fofr(cplex*nfft*ndat) 3697 3698 #ifdef HAVE_FFTW3_MPI 3699 !Local variables------------------------------- 3700 !scalars 3701 integer :: nx,ny,nz,my_flags 3702 3703 ! ************************************************************************* 3704 3705 my_flags=ABI_FFTW_ESTIMATE; if (PRESENT(fftw_flags)) my_flags= fftw_flags 3706 3707 nx=ngfft(1); ny=ngfft(2); nz=ngfft(3) 3708 !me_fft=ngfft(11); nproc_fft=ngfft(10) 3709 3710 select case (cplex) 3711 3712 case (1) 3713 3714 ! Complex to Complex. 3715 ! This one is ok when ndat > 1 3716 !call fftw3_mpifourdp_c2c(cplex,nfft,ngfft,ndat,isign,& 3717 !& fftn2_distrib,ffti2_local,fftn3_distrib,ffti3_local,fofg,fofr,comm_fft,fftw_flags=my_flags) 3718 !return 3719 3720 ! r2c or c2r case. 3721 ! FIXME this one is buggy when ndat > 1 3722 select case (isign) 3723 case (ABI_FFTW_FORWARD) 3724 ! +1; R --> G 3725 call fftw3_mpifourdp_r2c(nfft,ngfft,ndat,fftn2_distrib,ffti2_local,fftn3_distrib,ffti3_local,& 3726 & fofg,fofr,comm_fft,fftw_flags=my_flags) 3727 3728 case (ABI_FFTW_BACKWARD) 3729 ! -1; G --> R 3730 call fftw3_mpifourdp_c2r(nfft,ngfft,ndat,fftn2_distrib,ffti2_local,fftn3_distrib,ffti3_local,& 3731 & fofg,fofr,comm_fft,fftw_flags=my_flags) 3732 3733 case default 3734 ABI_BUG("Wrong isign") 3735 end select 3736 3737 case (2) 3738 ! Complex to Complex. 3739 call fftw3_mpifourdp_c2c(cplex,nfft,ngfft,ndat,isign,& 3740 & fftn2_distrib,ffti2_local,fftn3_distrib,ffti3_local,fofg,fofr,comm_fft,fftw_flags=my_flags) 3741 3742 case default 3743 ABI_BUG(" Wrong value for cplex") 3744 end select 3745 3746 #else 3747 ABI_ERROR("FFTW3_MPI support not activated") 3748 ABI_UNUSED((/cplex,nfft,ngfft(1),ndat,isign,comm_fft/)) 3749 ABI_UNUSED((/fftn2_distrib(1),ffti2_local(1)/)) 3750 ABI_UNUSED((/fftn3_distrib(1),ffti3_local(1)/)) 3751 if (PRESENT(fftw_flags)) then 3752 ABI_UNUSED(fftw_flags) 3753 end if 3754 ABI_UNUSED(fofg(1,1)) 3755 ABI_UNUSED(fofr(1)) 3756 #endif 3757 3758 end subroutine old_fftw3_mpifourdp
m_fftw3/zplan_many_dft [ Functions ]
[ Top ] [ m_fftw3 ] [ Functions ]
NAME
FUNCTION
INPUTS
SIDE EFFECTS
SOURCE
2661 !! FIXME technically it should be intent(inout) since FFTW3 can destroy the input for particular flags. 2662 2663 function zplan_many_dft(rank,n,howmany,fin,inembed,istride,idist,fout,onembed,ostride,odist,sign,flags,nthreads) result(plan) 2664 2665 !Arguments ------------------------------------ 2666 !scalars 2667 integer,intent(in) :: rank,howmany,istride,ostride, sign,flags,idist,odist,nthreads 2668 integer,intent(in) :: n(rank),inembed(rank),onembed(rank) 2669 integer(KIND_FFTW_PLAN) :: plan 2670 !arrays 2671 complex(dpc) :: fin(*),fout(*) 2672 2673 !Local variables------------------------------- 2674 character(len=500) :: msg,frmt 2675 2676 ! ************************************************************************* 2677 2678 !$OMP CRITICAL (OMPC_zplan_many_dft) 2679 call fftw3_set_nthreads(nthreads) 2680 2681 call dfftw_plan_many_dft(plan, rank, n, howmany, & 2682 & fin, inembed, istride, idist, fout, onembed, ostride, odist, sign, flags) 2683 !$OMP END CRITICAL (OMPC_zplan_many_dft) 2684 2685 if (plan==NULL_PLAN) then ! handle the error 2686 call wrtout(std_out, "dfftw_plan_many_dft returned NULL_PLAN (complex version)") 2687 write(frmt,*)"(a,",rank,"(1x,i0),3(a,i0),a,2(a,",rank,"(1x,i0),2(a,i0),a))" 2688 write(msg,frmt)& 2689 & " n = ",n," howmany = ",howmany," sign = ",sign," flags = ",flags,ch10,& 2690 & " inembed = ",inembed," istride = ",istride," idist =",idist,ch10, & 2691 & " onembed = ",onembed," ostride = ",ostride," odist =",idist,ch10 2692 call wrtout(std_out, msg) 2693 ABI_ERROR("Check FFTW library and/or abinit code") 2694 end if 2695 2696 end function zplan_many_dft
m_m_fftw3/fftw3_mpifourdp [ Functions ]
NAME
fftw3_mpifourdp
FUNCTION
Conduct Fourier transform of REAL or COMPLEX function f(r)=fofr defined on fft grid in real space, to create complex f(G)=fofg defined on full fft grid in reciprocal space, in full storage mode, or the reverse operation. For the reverse operation, the final data is divided by nfftot. REAL case when cplex=1, COMPLEX case when cplex=2 Usually used for density and potentials.
INPUTS
cplex=1 if fofr is real, 2 if fofr is complex nfft=(effective) number of FFT grid points (for this processor) ngfft(18)=contain all needed information about 3D FFT, see ~abinit/doc/variables/vargs.htm#ngfft ndat=Numbre of FFT transforms isign=sign of Fourier transform exponent: current convention uses +1 for transforming from G to r -1 for transforming from r to G. fftn2_distrib(2),ffti2_local(2) fftn3_distrib(3),ffti3_local(3) comm_fft=MPI communicator
SIDE EFFECTS
Input/Output fofg(2,nfft)=f(G), complex. fofr(cplex*nfft)=input function f(r) (real or complex)
SOURCE
5216 subroutine fftw3_mpifourdp(cplex,nfft,ngfft,ndat,isign,& 5217 & fftn2_distrib,ffti2_local,fftn3_distrib,ffti3_local,fofg,fofr,comm_fft) 5218 5219 !Arguments ------------------------------------ 5220 !scalars 5221 integer,intent(in) :: cplex,isign,nfft,ndat,comm_fft 5222 !arrays 5223 integer,intent(in) :: ngfft(18) 5224 integer,intent(in) :: fftn2_distrib(ngfft(2)),ffti2_local(ngfft(2)) 5225 integer,intent(in) :: fftn3_distrib(ngfft(3)),ffti3_local(ngfft(3)) 5226 real(dp),intent(inout) :: fofg(2,nfft*ndat),fofr(cplex*nfft*ndat) 5227 5228 !Local variables------------------------------- 5229 !scalars 5230 integer :: n1,n2,n3,n4,n5,n6,nd2proc,nd3proc,nproc_fft,me_fft 5231 !arrays 5232 real(dp),allocatable :: workf(:,:,:,:,:),workr(:,:,:,:,:) 5233 5234 ! ************************************************************************* 5235 5236 ! Note the only c2c is supported in parallel. 5237 n1=ngfft(1); n2=ngfft(2); n3=ngfft(3) 5238 n4=ngfft(4); n5=ngfft(5); n6=ngfft(6) 5239 me_fft=ngfft(11); nproc_fft=ngfft(10) 5240 5241 nd2proc=((n2-1)/nproc_fft) +1 5242 nd3proc=((n6-1)/nproc_fft) +1 5243 ABI_MALLOC(workr,(2,n4,n5,nd3proc,ndat)) 5244 ABI_MALLOC(workf,(2,n4,n6,nd2proc,ndat)) 5245 5246 ! Complex to Complex 5247 ! TODO: Complex to Real 5248 select case (isign) 5249 case (1) 5250 ! G --> R 5251 call mpifft_fg2dbox(nfft,ndat,fofg,n1,n2,n3,n4,nd2proc,n6,fftn2_distrib,ffti2_local,me_fft,workf) 5252 5253 call fftw3_mpiback(2,ndat,n1,n2,n3,n4,n5,n6,n4,nd2proc,nd3proc,2,workf,workr,comm_fft) 5254 5255 call mpifft_dbox2fr(n1,n2,n3,n4,n5,nd3proc,ndat,fftn3_distrib,ffti3_local,me_fft,workr,cplex,nfft,fofr) 5256 5257 case (-1) 5258 ! R --> G 5259 call mpifft_fr2dbox(cplex,nfft,ndat,fofr,n1,n2,n3,n4,n5,nd3proc,fftn3_distrib,ffti3_local,me_fft,workr) 5260 5261 call fftw3_mpiforw(2,ndat,n1,n2,n3,n4,n5,n6,n4,nd2proc,nd3proc,2,workr,workf,comm_fft) 5262 5263 ! Transfer FFT output to the original fft box. 5264 call mpifft_dbox2fg(n1,n2,n3,n4,nd2proc,n6,ndat,fftn2_distrib,ffti2_local,me_fft,workf,nfft,fofg) 5265 5266 case default 5267 ABI_BUG("Wrong isign") 5268 end select 5269 5270 ABI_FREE(workr) 5271 ABI_FREE(workf) 5272 5273 end subroutine fftw3_mpifourdp