TABLE OF CONTENTS


ABINIT/ccfft [ Functions ]

[ Top ] [ Functions ]

NAME

 ccfft

FUNCTION

 Carry out complex-to-complex Fourier transforms between real
 and reciprocal (G) space. Library of such routines.
 Include machine-dependent F90 routines used with fftalg=200.

INPUTS

  fftalga=govern the choice of the fft routine to be used
    if 1: SGoedecker routine
    if 2: Machine dependent routine, depending on the precompilation options
    if 3: FFTW library routine
    if 4: new SGoedecker routine, version 2002
          Warning : the second and third dimensions of the Fourier space
          array are switched, compared to the usual case
  fftcache=size of the cache (kB)
  isign= Integer specifying which sign to be used for the transformation.
         must be either +1 or -1.
  ngfft(18)=contain all needed information about 3D FFT, see ~abinit/doc/variables/vargs.htm#ngfft
  n1,n2,n3=Actual integer dimensions (see ngfft) for the 3D sequence.
           Physical dimension of the transform.
  n4,n5,n6=Leading dimensions. Generally, n6 is not different to n3.
  ndat=number of FFT to do in //
  option= 1 if call from fourwf, 2 if call from other routine
  work1(2,n4*n5*n6)=Array to be transformed.

OUTPUT

  inplace = 0 if result is in work2 ; =1 if result is in work1 (machine dependent)
  normalized=0 if the backward (isign=-1) FFT is not normalized, so has to be normalized outside of ccfft
            =1 otherwise
  work2(2,n4*n5*n6)=transformed array in case inplace=0.

SIDE EFFECTS

  work1(2,n4*n5*n6)=at input, array to be transformed
                    at output, transformed array (in case inplace=1)

NOTES

 precompilation definitions :
   -D(machine_list) :  (case fftalga=200)
      choice of machine-dependent FFT library, if permitted
   -DHAVE_FFT_FFTW2   : (case fftalga=300) activate the FFTW lib
   -Dnolib  : (case fftalga=200) call SGoedecker routine,
      instead of machine-dependent one

 More about fftalga=200
 Library routines for the following platforms have been implemented :
  Compaq/DEC
  HP          (in place FFT)
  SGI         (in place FFT)
  NEC         (in place FFT)
 For all the other platforms, or if the CPP directive nolib is
 activated, one uses the fft routine from S. Goedecker.

SOURCE

3371 subroutine ccfft(ngfft,isign,n1,n2,n3,n4,n5,n6,ndat,option,work1,work2,comm_fft)
3372 
3373 
3374 !Arguments ------------------------------------
3375 !scalars
3376  integer,intent(in) :: isign,n1,n2,n3,n4,n5,n6,ndat,option,comm_fft
3377 !arrays
3378  integer,intent(in) :: ngfft(18)
3379  real(dp),intent(inout) :: work1(2,n4*n5*n6*ndat)
3380  real(dp),intent(inout) :: work2(2,n4*n5*n6*ndat) !vz_i
3381 
3382 !Local variables ------------------------------
3383 !scalars
3384  integer,parameter :: cplex2=2
3385  integer :: fftalg,fftalga,fftalgb,fftalgc,fftcache
3386  integer :: nd2proc,nd3proc,nproc_fft
3387  character(len=500) :: msg
3388 
3389 !*************************************************************************
3390 
3391  !print *, "in ccfft"
3392  nproc_fft=ngfft(10)
3393  fftcache=ngfft(8); fftalg  =ngfft(7); fftalga =fftalg/100; fftalgb=mod(fftalg,100)/10; fftalgc=mod(fftalg,10)
3394 
3395  if(fftalga==2)then
3396    ABI_ERROR("Machine dependent FFTs are not supported anymore")
3397 
3398  else if(fftalga==3)then
3399    ABI_ERROR("Old interface with FFTW2 is not supported anymore")
3400 
3401  else if(fftalga<1 .or. fftalga>4)then
3402    write(msg, '(a,a,a,i5,a,a)' )&
3403     'The allowed values of fftalg(A) are 1, 2, 3, and 4 .',ch10,&
3404     'The actual value of fftalg(A) is',fftalga,ch10,&
3405     'Action: check the value of fftalg in your input file.'
3406    ABI_ERROR(msg)
3407  end if
3408 
3409  ! This routine will be removed ASAP.
3410  ! Do not add new FFT libraries without previous discussion with Matteo Giantomassi
3411  ! inplace==1 or normalize==2 are not supported anymore in the caller (fourwf, fourdp)
3412  !inplace=0; normalized=0
3413 
3414  if (fftalga/=4) then
3415    ! Call Stefan Goedecker FFT
3416    call sg_fft_cc(fftcache,n1,n2,n3,n4,n5,n6,ndat,isign,work1,work2)
3417 
3418  else if (fftalga==4) then
3419    ! Call new version of Stefan Goedecker FFT
3420    nd2proc=((n2-1)/nproc_fft) +1
3421    nd3proc=((n6-1)/nproc_fft) +1
3422 
3423    if (isign==1) then
3424      ! Fourier to real space (backward)
3425      call sg2002_back(cplex2,ndat,n1,n2,n3,n4,n5,n6,n4,nd2proc,nd3proc,option,work1,work2,comm_fft)
3426    else
3427      ! isign=-1, real space to Fourier (forward)
3428      call sg2002_forw(cplex2,ndat,n1,n2,n3,n4,n5,n6,n4,nd2proc,nd3proc,option,work1,work2,comm_fft)
3429    end if
3430  end if
3431 
3432 end subroutine ccfft

ABINIT/fft_init_counters [ Functions ]

[ Top ] [ Functions ]

NAME

 fft_init_counters

FUNCTION

SOURCE

4673 subroutine fft_init_counters()
4674 
4675    fourdp_counter = 0
4676    fourwf_counter = 0
4677 
4678 end subroutine fft_init_counters

ABINIT/fft_output_counters [ Functions ]

[ Top ] [ Functions ]

NAME

 fft_output_counters

FUNCTION

SOURCE

4705 subroutine fft_output_counters(nbandtot, mpi_enreg)
4706 
4707 !Arguments ------------------------------------
4708  integer,intent(in) :: nbandtot
4709  type(MPI_type),intent(in) :: mpi_enreg
4710 
4711 !Local variables-------------------------------
4712 !scalars
4713  character(len=500) :: msg
4714  integer :: cnt,ierr
4715 !arrays
4716 
4717  call wrtout([std_out,ab_out],'')
4718  write(msg,'(a)')                ' --- FFT COUNTERS ------------------------------------------------------------'
4719  call wrtout([std_out,ab_out], msg)
4720  write(msg,'(a,i6)')             ' total Number of Bands         : NB = ',nbandtot
4721  call wrtout([std_out,ab_out], msg)
4722  write(msg,'(a)')                '                      | total count (TC) |            TC/NB'
4723  call wrtout([std_out,ab_out], msg)
4724  write(msg,'(a)')                ' -----------------------------------------------------------------------------'
4725  call wrtout([std_out,ab_out], msg)
4726  call xmpi_sum(fourwf_counter,mpi_enreg%comm_kpt,ierr)
4727  cnt=fourdp_counter
4728  if (cnt>0) then
4729    write(msg,'(a,i16,a)')       ' fourdp               | ',cnt,' |'
4730    call wrtout([std_out,ab_out], msg)
4731  end if
4732  cnt=fourwf_counter
4733  if (cnt>0) then
4734    write(msg,'(a,i16,a,f16.1)') ' fourwf               | ',cnt,' | ',dble(cnt)/nbandtot
4735    call wrtout([std_out,ab_out], msg)
4736  end if
4737  write(msg,'(a)')                ' -----------------------------------------------------------------------------'
4738  call wrtout([std_out,ab_out], msg)
4739 
4740 end subroutine fft_output_counters

ABINIT/fft_stop_counters [ Functions ]

[ Top ] [ Functions ]

NAME

 fft_stop_counters

FUNCTION

SOURCE

4689 subroutine fft_stop_counters()
4690 
4691    fourdp_counter = -1
4692    fourwf_counter = -1
4693 
4694 end subroutine fft_stop_counters

ABINIT/fourdp [ Functions ]

[ Top ] [ Functions ]

NAME

 fourdp

FUNCTION

 Conduct Fourier transform of REAL or COMPLEX function f(r)=fofr defined on
 fft grid in real space, to create complex f(G)=fofg defined on full fft grid
 in reciprocal space, in full storage mode, or the reverse operation.
 For the reverse operation, the final data is divided by nfftot.
 REAL case when cplex=1, COMPLEX case when cplex=2
 Usually used for density and potentials.

 There are two different possibilities:
  fftalgb=0 means using the complex-to-complex FFT routine, irrespective of the value of cplex
  fftalgb=1 means using a real-to-complex FFT or a complex-to-complex FFT, depending on the value of cplex.
  The only real-to-complex FFT available is from SGoedecker library.

INPUTS

 cplex=1 if fofr is real, 2 if fofr is complex
 isign=sign of Fourier transform exponent: current convention uses
  +1 for transforming from G to r
  -1 for transforming from r to G.
 mpi_enreg=information about MPI parallelization
 nfft=(effective) number of FFT grid points (for this processor)
 ndat=Number of functions to transform
 ngfft(18)=contain all needed information about 3D FFT, see ~abinit/doc/variables/vargs.htm#ngfft
 tim_fourdp=timing code of the calling routine (can be set to 0 if not attributed)

SIDE EFFECTS

 Input/Output
 fofg(2,nfft)=f(G), complex.
 fofr(cplex*nfft)=input function f(r) (real or complex)

SOURCE

2972 subroutine fourdp(cplex, fofg, fofr, isign, mpi_enreg, nfft, ndat, ngfft, tim_fourdp)
2973 
2974 !Arguments ------------------------------------
2975 !scalars
2976  integer,intent(in) :: cplex,isign,nfft,ndat,tim_fourdp
2977  type(MPI_type),intent(in) :: mpi_enreg
2978 !arrays
2979  integer,intent(in) :: ngfft(18)
2980  real(dp),intent(inout) :: fofg(2,nfft,ndat),fofr(cplex*nfft,ndat)
2981 
2982 !Local variables-------------------------------
2983 !scalars
2984  integer :: fftalg,fftalga,fftalgb,fftcache,i1,i2,i3,base,idat
2985  integer :: n1,n1half1,n1halfm,n2,n2half1,n3,n4
2986  integer :: n4half1,n5,n5half1,n6 !nd2proc,nd3proc,i3_local,i2_local,
2987  integer :: comm_fft,nproc_fft,me_fft
2988  real(dp) :: xnorm
2989  character(len=500) :: msg
2990 !arrays
2991  integer, ABI_CONTIGUOUS pointer :: fftn2_distrib(:),ffti2_local(:)
2992  integer, ABI_CONTIGUOUS pointer :: fftn3_distrib(:),ffti3_local(:)
2993  real(dp) :: tsec(2)
2994  real(dp),allocatable :: work1(:,:,:,:,:),work2(:,:,:,:,:)
2995  real(dp),allocatable :: workf(:,:,:,:,:),workr(:,:,:,:,:)
2996 
2997 ! *************************************************************************
2998 
2999  ABI_CHECK(ndat == 1, "ndat != 1 should be tested")
3000 
3001  ! Keep track of timing
3002  call timab(1260+tim_fourdp,1,tsec)
3003 
3004  if (fourdp_counter>=0) then
3005    fourdp_counter = fourdp_counter + ndat
3006  end if
3007 
3008  n1=ngfft(1); n2=ngfft(2); n3=ngfft(3)
3009  n4=ngfft(4); n5=ngfft(5); n6=ngfft(6)
3010  me_fft=ngfft(11); nproc_fft=ngfft(10)
3011  comm_fft = mpi_enreg%comm_fft
3012  !write(std_out,*)"fourdp, nx,ny,nz,nfft =",n1,n2,n3,nfft
3013 
3014  fftcache=ngfft(8)
3015  fftalg  =ngfft(7); fftalga =fftalg/100; fftalgb =mod(fftalg,100)/10
3016 
3017  xnorm=one/dble(n1*n2*n3)
3018  !write(std_out,*)' fourdp :me_fft',me_fft,'nproc_fft',nproc_fft,'nfft',nfft
3019 
3020  if (fftalgb /= 0 .and. fftalgb /= 1) then
3021    write(msg, '(a,i0,5a)' )&
3022     'The input algorithm number fftalg= ',fftalg,' is not allowed.',ch10,&
3023     'The second digit (fftalg(B)) must be 0 or 1.',ch10,&
3024     'Action: change fftalg in your input file.'
3025    ABI_BUG(msg)
3026  end if
3027 
3028  if (fftalgb == 1 .and. ALL(fftalga /= [1,3,4,5])) then
3029    write(msg,'(a,i0,5a)')&
3030     'The input algorithm number fftalg= ',fftalg,' is not allowed.',ch10,&
3031     'When fftalg(B) is 1, the allowed values for fftalg(A) are 1 and 4.',ch10,&
3032     'Action: change fftalg in your input file.'
3033    ABI_BUG(msg)
3034  end if
3035 
3036  if (n4<n1.or.n5<n2.or.n6<n3) then
3037    write(msg,'(a,3(i0,1x),a,3(i0,1x))')'  Each of n4,n5,n6=',n4,n5,n6,'must be >= n1, n2, n3 =',n1,n2,n3
3038    ABI_BUG(msg)
3039  end if
3040 
3041  ! Get the distrib associated with this fft_grid => for i2 and i3 planes
3042  call ptabs_fourdp(mpi_enreg,n2,n3,fftn2_distrib,ffti2_local,fftn3_distrib,ffti3_local)
3043 
3044  ! Branch immediately depending on nproc_fft
3045  if (nproc_fft > 1) then
3046    call fourdp_mpi(cplex,nfft,ngfft,ndat,isign,fftn2_distrib,ffti2_local,fftn3_distrib,ffti3_local,fofg,fofr,comm_fft)
3047    goto 100
3048  end if
3049 
3050  if (fftalga == FFT_FFTW3) then
3051    ! Call sequential or MPI FFTW3 version.
3052    if (nproc_fft == 1) then
3053      !call wrtout(std_out,"FFTW3 SEQFOURDP")
3054      call fftw3_seqfourdp(cplex,n1,n2,n3,n1,n2,n3,ndat,isign,fofg,fofr)
3055    else
3056      !call wrtout(std_out,"FFTW3 MPIFOURDP")
3057      call fftw3_mpifourdp(cplex,nfft,ngfft,ndat,isign,&
3058       fftn2_distrib,ffti2_local,fftn3_distrib,ffti3_local,fofg,fofr,comm_fft)
3059    end if
3060    ! Accumulate timing and return
3061    call timab(1260+tim_fourdp,2,tsec); return
3062  end if
3063 
3064  if (fftalga == FFT_DFTI) then
3065    ! Call sequential or MPI MKL.
3066    if (nproc_fft == 1) then
3067      call dfti_seqfourdp(cplex,n1,n2,n3,n1,n2,n3,ndat,isign,fofg,fofr)
3068    else
3069      ABI_ERROR("MPI fourdp with MKL cluster DFT not implemented")
3070    end if
3071    ! Accumulate timing and return
3072    call timab(1260+tim_fourdp,2,tsec); return
3073  end if
3074 
3075  ! Here, deal with the new SG FFT, complex-to-complex case
3076  if (fftalga==FFT_SG2002 .and. (fftalgb==0 .or. cplex==2)) then
3077    call sg2002_mpifourdp(cplex,nfft,ngfft,ndat,isign,fftn2_distrib,ffti2_local,fftn3_distrib,ffti3_local,fofg,fofr,comm_fft)
3078    !call sg2002_seqfourdp(cplex,nfft,ngfft,ndat,isign,fftn2_fofg,fofr)
3079  end if
3080 
3081  ! Here, deal with the new SG FFT, with real-to-complex
3082  if (fftalga==FFT_SG2002 .and. fftalgb==1 .and. cplex==1) then
3083    ABI_CHECK(nproc_fft == 1,"fftalg 41x does not support nproc_fft > 1")
3084    ABI_CHECK(ndat == 1, "ndat must be 1")
3085 
3086    n1half1=n1/2+1; n1halfm=(n1+1)/2
3087    n2half1=n2/2+1
3088    ! n4half1 or n5half1 are the odd integers >= n1half1 or n2half1
3089    n4half1=(n1half1/2)*2+1
3090    n5half1=(n2half1/2)*2+1
3091    ABI_MALLOC(workr, (2,n4half1,n5,n6,ndat))
3092    ABI_MALLOC(workf, (2,n4,n6,n5half1,ndat))
3093 
3094    if (isign==1) then
3095      do idat=1,ndat
3096        do i3=1,n3
3097          do i2=1,n2half1
3098            base=n1*(i2-1+n2*(i3-1))
3099            do i1=1,n1
3100              workf(1,i1,i3,i2,idat) = fofg(1,i1+base,idat)
3101              workf(2,i1,i3,i2,idat) = fofg(2,i1+base,idat)
3102            end do
3103          end do
3104        end do
3105      end do
3106 
3107      !nd2proc=((n2-1)/nproc_fft) +1
3108      !nd3proc=((n6-1)/nproc_fft) +1
3109 
3110      ! change the call? n5half1 et n6 ?
3111      call sg2002_back(cplex,ndat,n1,n2,n3,n4,n5,n6,n4half1,n5half1,n6,2,workf,workr,comm_fft)
3112 
3113      do idat=1,ndat
3114        do i3=1,n3
3115          do i2=1,n2
3116            base=n1*(i2-1+n2*(i3-1))
3117            do i1=1,n1half1-1
3118              ! copy data
3119              fofr(2*i1-1+base, idat) = workr(1,i1,i2,i3,idat)
3120              fofr(2*i1  +base, idat) = workr(2,i1,i2,i3,idat)
3121            end do
3122            ! If n1 odd, must add last data
3123            if((2*n1half1-2)/=n1)then
3124              fofr(n1+base, idat) = workr(1,n1half1,i2,i3,idat)
3125            end if
3126          end do
3127        end do
3128      end do
3129 
3130    else if (isign==-1) then
3131      do idat=1,ndat
3132        do i3=1,n3
3133          do i2=1,n2
3134            base=n1*(i2-1+n2*(i3-1))
3135            do i1=1,n1half1-1
3136              workr(1,i1,i2,i3,idat)=fofr(2*i1-1+base,idat)
3137              workr(2,i1,i2,i3,idat)=fofr(2*i1  +base,idat)
3138            end do
3139            ! If n1 odd, must add last data
3140            if((2*n1half1-2)/=n1)then
3141              workr(1,n1half1,i2,i3,idat)=fofr(n1+base,idat)
3142              workr(2,n1half1,i2,i3,idat)=zero
3143            end if
3144          end do
3145        end do
3146      end do
3147 
3148      call sg2002_forw(cplex,ndat,n1,n2,n3,n4,n5,n6,n4half1,n5half1,n6,2,workr,workf,comm_fft)
3149 
3150      ! Transfer fft output to the original fft box
3151      do idat=1,ndat
3152        do i3=1,n3
3153 
3154          do i2=1,n2half1
3155            base=n1*(i2-1+n2*(i3-1))
3156            do i1=1,n1
3157              fofg(1,i1+base,idat) = workf(1,i1,i3,i2,idat)*xnorm
3158              fofg(2,i1+base,idat) = workf(2,i1,i3,i2,idat)*xnorm
3159            end do
3160          end do
3161 
3162          ! Complete missing values with complex conjugate
3163          ! Inverse of ix is located at nx+2-ix , except for ix=1, for which it is 1.
3164          if(n2half1>2)then
3165            do i2=2,n2+1-n2half1
3166              base=n1*((n2+2-i2)-1)
3167              if(i3/=1)base=base+n1*n2*((n3+2-i3)-1)
3168              fofg(1,1+base,idat)= workf(1,1,i3,i2,idat)*xnorm
3169              fofg(2,1+base,idat)=-workf(2,1,i3,i2,idat)*xnorm
3170              do i1=2,n1
3171                fofg(1,n1+2-i1+base,idat)= workf(1,i1,i3,i2,idat)*xnorm
3172                fofg(2,n1+2-i1+base,idat)=-workf(2,i1,i3,i2,idat)*xnorm
3173              end do
3174            end do
3175          end if
3176 
3177        end do
3178      end do
3179 
3180    end if ! isign
3181    ABI_FREE(workr)
3182    ABI_FREE(workf)
3183  end if
3184 
3185  ! Here, one calls the complex-to-complex FFT subroutine
3186  if( (fftalgb==0 .or. cplex==2) .and. fftalga/=4 )then
3187    ABI_CHECK(ndat == 1, "ndat must be 1")
3188 
3189    ABI_MALLOC(work1, (2,n4,n5,n6,ndat))
3190    ABI_MALLOC(work2, (2,n4,n5,n6,ndat))
3191 
3192    if (isign==1) then
3193 
3194      ! Transfer fofg to the expanded fft box
3195 !$OMP PARALLEL DO COLLAPSE(2) PRIVATE(base)
3196      do idat=1,ndat
3197        do i3=1,n3
3198          do i2=1,n2
3199            base=n1*(i2-1+n2*(i3-1))
3200            do i1=1,n1
3201              work1(1,i1,i2,i3,idat) = fofg(1,i1+base,idat)
3202              work1(2,i1,i2,i3,idat) = fofg(2,i1+base,idat)
3203            end do
3204          end do
3205        end do
3206      end do
3207 
3208      ! Call Goedecker C2C FFT
3209      !call sg_fft_cc(fftcache,n1,n2,n3,n4,n5,n6,ndat,isign,work1,work2)
3210      call ccfft(ngfft,isign,n1,n2,n3,n4,n5,n6,ndat,2,work1,work2,comm_fft)
3211 
3212      ! Take data from expanded box and put it in the original box.
3213      if (cplex==1) then
3214        ! REAL case
3215 !$OMP PARALLEL DO COLLAPSE(2) PRIVATE(base)
3216        do idat=1,ndat
3217          do i3=1,n3
3218            do i2=1,n2
3219              base=n1*(i2-1+n2*(i3-1))
3220              do i1=1,n1
3221                fofr(i1+base,idat) = work2(1,i1,i2,i3,idat)
3222              end do
3223            end do
3224          end do
3225        end do
3226 
3227      else
3228        ! COMPLEX case
3229 !$OMP PARALLEL DO COLLAPSE(2) PRIVATE(base)
3230        do idat=1,ndat
3231          do i3=1,n3
3232            do i2=1,n2
3233              base=2*n1*(i2-1+n2*(i3-1))
3234              do i1=1,n1
3235                fofr(2*i1-1+base, idat) = work2(1,i1,i2,i3,idat)
3236                fofr(2*i1  +base, idat) = work2(2,i1,i2,i3,idat)
3237              end do
3238            end do
3239          end do
3240        end do
3241      end if
3242 
3243    else if (isign==-1) then
3244 
3245      ! Insert fofr into the augmented fft box
3246      if (cplex==1) then
3247        ! REAL case copy data
3248 !$OMP PARALLEL DO COLLAPSE(2) PRIVATE(base)
3249        do idat=1,ndat
3250          do i3=1,n3
3251            do i2=1,n2
3252              base=n1*(i2-1+n2*(i3-1))
3253              do i1=1,n1
3254                work1(1,i1,i2,i3,idat) = fofr(i1+base,idat)
3255                work1(2,i1,i2,i3,idat) = zero
3256              end do
3257            end do
3258          end do
3259        end do
3260      else
3261        ! COMPLEX case copy data
3262 !$OMP PARALLEL DO COLLAPSE(2) PRIVATE(base)
3263        do idat=1,ndat
3264          do i3=1,n3
3265            do i2=1,n2
3266              base=2*n1*(i2-1+n2*(i3-1))
3267              do i1=1,n1
3268                work1(1,i1,i2,i3, idat) = fofr(2*i1-1+base, idat)
3269                work1(2,i1,i2,i3, idat) = fofr(2*i1  +base, idat)
3270              end do
3271            end do
3272          end do
3273        end do
3274      end if ! cplex
3275 
3276      ! Call Stefan Goedecker C2C FFT
3277      !call sg_fft_cc(fftcache,n1,n2,n3,n4,n5,n6,ndat,isign,work1,work2)
3278      call ccfft(ngfft,isign,n1,n2,n3,n4,n5,n6,ndat,2,work1,work2,comm_fft)
3279 
3280      ! Transfer fft output to the original fft box
3281 !$OMP PARALLEL DO COLLAPSE(2) PRIVATE(base)
3282      do idat=1,ndat
3283        do i3=1,n3
3284          do i2=1,n2
3285            base=n1*(i2-1+n2*(i3-1))
3286            do i1=1,n1
3287              fofg(1,i1+base,idat) = work2(1,i1,i2,i3,idat)*xnorm
3288              fofg(2,i1+base,idat) = work2(2,i1,i2,i3,idat)*xnorm
3289            end do
3290          end do
3291        end do
3292      end do
3293 
3294    end if ! isign
3295 
3296    ABI_FREE(work1)
3297    ABI_FREE(work2)
3298  end if ! End simple algorithm
3299 
3300  ! Here sophisticated algorithm based on S. Goedecker routines, only for the REAL case.
3301  ! Take advantage of the fact that fofr is real, and that fofg has corresponding symmetry properties.
3302  if( (fftalgb==1 .and. cplex==1) .and. fftalga/=4 )then
3303    ABI_CHECK(nproc_fft == 1,"nproc > 1 not supported")
3304    do idat=1,ndat
3305      call sg_fft_rc(cplex,fofg(1,1,idat),fofr(1,idat),isign,nfft,ngfft)
3306    end do
3307  end if
3308 
3309  100 call timab(1260+tim_fourdp,2,tsec)
3310 
3311 end subroutine fourdp

ABINIT/fourwf [ Functions ]

[ Top ] [ Functions ]

NAME

 fourwf

FUNCTION

 Carry out composite Fourier transforms between real and reciprocal (G) space.
 Wavefunctions, contained in a sphere in reciprocal space,
 can be FFT to real space. They can also be FFT from real space
 to a sphere. Also, the density maybe accumulated, and a local
 potential can be applied.

 The different options are :
 - option=0 --> reciprocal to real space and output the result.
 - option=1 --> reciprocal to real space and accumulate the density.
 - option=2 --> reciprocal to real space, apply the local potential to the wavefunction
                in real space and produce the result in reciprocal space.
 - option=3 --> real space to reciprocal space.
                NOTE that in this case, fftalg=1x1 MUST be used. This may be changed in the future.

 The different sections of this routine corresponds to different
 algorithms, used independently of each others :
 - fftalg=xx0 : use simple complex-to-complex routines, without zero padding
     (rather simple, so can be used to understand how fourwf.f works);
 - fftalg=1x1 : use S Goedecker routines, with zero padding
     (7/12 savings in execution time);
 - fftalg=1x2 : call even more sophisticated coding also based on S Goedecker routines

 This routine contains many parts that differ only
 by small details, in order to treat each case with the better speed.
 Also for better speed, it uses no F90 construct, except the allocate command
 and for zeroing arrays.

INPUTS

 cplex= if 1 , denpot is real, if 2 , denpot is complex
    (cplex=2 only allowed for option=2, and istwf_k=1)
    not relevant if option=0 or option=3, so cplex=0 can be used to minimize memory
 fofgin(2,npwin)=holds input wavefunction in G vector basis sphere.
                 (intent(in) but the routine sphere can modify it for another iflag)
 gboundin(2*mgfft+8,2)=sphere boundary info for reciprocal to real space
 gboundout(2*mgfft+8,2)=sphere boundary info for real to reciprocal space
 istwf_k=option parameter that describes the storage of wfs
 kg_kin(3,npwin)=reduced planewave coordinates, input
 kg_kout(3,npwout)=reduced planewave coordinates, output
 mgfft=maximum size of 1D FFTs
 mpi_enreg=information about MPI parallelization
 ndat=number of FFT to do in //
 ngfft(18)=contain all needed information about 3D FFT, see ~abinit/doc/variables/vargs.htm#ngfft
 npwin=number of elements in fofgin array (for option 0, 1 and 2)
 npwout=number of elements in fofgout array (for option 2 and 3)
 n4,n5,n6=ngfft(4),ngfft(5),ngfft(6), dimensions of fofr.
 option= if 0: do direct FFT
         if 1: do direct FFT, then sum the density
         if 2: do direct FFT, multiply by the potential, then do reverse FFT
         if 3: do reverse FFT only
 tim_fourwf=timing code of the calling routine (can be set to 0 if not attributed)
 weight_r=weight to be used for the accumulation of the density in real space
         (needed only when option=1)
 weight_i=weight to be used for the accumulation of the density in real space
         (needed only when option=1 and (fftalg=4 and fftalgc/=0))
 [weight_array_r]= -- optional -- same as weight_r when ndat>1
                   weight_array_r(i)=weight_r to be used for band i
                   at present only used for the GPU version
 [weight_array_i]= -- optional -- same as weight_i when ndat>1
                   weight_array_i(i)=weight_i to be used for band i
                   at present only used for the GPU version
 [fofginb(2,npwin)]=holds second input wavefunction in G vector basis sphere.
                 (intent(in) but the routine sphere can modify it for another iflag)
                 (for non diagonal occupation)
 [use_ndo] = use non diagonal occupations.
 [gpu_option] = GPU implementation to use, i.e. cuda, openMP, ... (0=not using GPU)

OUTPUT

  (see side effects)

SIDE EFFECTS

 Input/Output
 for option==0, fofgin(2,npwin*ndat)=holds input wavefunction in G sphere;
                fofr(2,n4,n5,n6*ndat) contains the output Fourier Transform of fofgin;
                no use of denpot, fofgout and npwout.
 for option==1, fofgin(2,npwin*ndat)=holds input wavefunction in G sphere;
                denpot(cplex*n4,n5,n6) contains the input density at input,
                and the updated density at output (accumulated);
                no use of fofgout and npwout.
 for option==2, fofgin(2,npwin*ndat)=holds input wavefunction in G sphere;
                denpot(cplex*n4,n5,n6) contains the input local potential;
                fofgout(2,npwout*ndat) contains the output function;
 for option==3, fofr(2,n4,n5,n6*ndat) contains the input real space wavefunction;
                fofgout(2,npwout*ndat) contains its output Fourier transform;
                no use of fofgin and npwin.

NOTES

   DO NOT CHANGE THE API OF THIS FUNCTION.
   If you need a specialized routine for the FFT of the wavefunctions, create
   a wrapper that uses fourwf to accomplish your task. This routine, indeed,
   has already too many parameters and each change in the API requires a careful
   modification of the different wrappers used for specialized FFTs such as FFTW3 and MKL-DFTI

SOURCE

2288 subroutine fourwf(cplex,denpot,fofgin,fofgout,fofr,gboundin,gboundout,istwf_k,&
2289                   kg_kin,kg_kout,mgfft,mpi_enreg,ndat,ngfft,npwin,npwout,n4,n5,n6,option,&
2290                   tim_fourwf,weight_r,weight_i, &
2291                   weight_array_r,weight_array_i,gpu_option,use_ndo,fofginb) ! Optional arguments
2292 
2293 !Arguments ------------------------------------
2294 !scalars
2295  integer,intent(in) :: cplex,istwf_k,mgfft,n4,n5,n6,ndat,npwin,npwout,option
2296  integer,intent(in) :: tim_fourwf
2297  integer,intent(in),optional :: gpu_option,use_ndo
2298  real(dp),intent(in) :: weight_r,weight_i
2299  real(dp),intent(in),optional,target :: weight_array_r(ndat),weight_array_i(ndat)
2300  type(MPI_type),intent(in) :: mpi_enreg
2301 !arrays
2302  integer,intent(in) :: gboundin(2*mgfft+8,2),gboundout(2*mgfft+8,2)
2303  integer,intent(in) :: kg_kin(3,npwin),kg_kout(3,npwout),ngfft(18)
2304  real(dp),intent(inout) :: denpot(cplex*n4,n5,n6),fofgin(2,npwin*ndat)
2305  real(dp),intent(inout),optional :: fofginb(:,:) ! (2,npwin*ndat)
2306  real(dp),intent(inout) :: fofr(2,n4,n5,n6*ndat)
2307  real(dp),intent(out) :: fofgout(2,npwout*ndat)
2308 
2309 !Local variables-------------------------------
2310 !scalars
2311  integer :: fftalg,fftalga,fftalgc,fftcache,i1,i2,i2_local,i3,i3_local,i3_glob,idat,ier
2312  integer :: iflag,ig,comm_fft,me_g0,me_fft,n1,n2,n3,nd2proc,nd3proc
2313  integer :: nfftot,nproc_fft,option_ccfft,paral_kgb,gpu_option_
2314  real(dp) :: fim,fre,xnorm
2315  character(len=500) :: msg
2316  logical :: luse_ndo
2317 !arrays
2318  integer,parameter :: shiftg0(3)=0
2319  integer,parameter :: symmE(3,3)=reshape([1,0,0,0,1,0,0,0,1],[3,3])
2320  integer, ABI_CONTIGUOUS pointer :: fftn2_distrib(:),ffti2_local(:)
2321  integer, ABI_CONTIGUOUS pointer :: fftn3_distrib(:),ffti3_local(:)
2322  real(dp) :: tsec(2)
2323  real(dp),allocatable :: work1(:,:,:,:),work2(:,:,:,:),work3(:,:,:,:)
2324  real(dp),allocatable :: work4(:,:,:,:),work_sum(:,:,:,:)
2325  real(dp),pointer :: weight_ptr_r(:),weight_ptr_i(:)
2326 
2327 ! *************************************************************************
2328 
2329  ! Accumulate timing
2330  call timab(840+tim_fourwf,1,tsec)
2331 
2332  if (fourwf_counter>=0) then
2333    fourwf_counter = fourwf_counter + ndat
2334    if (option==2) fourwf_counter = fourwf_counter + ndat
2335  end if
2336 
2337  n1=ngfft(1); n2=ngfft(2); n3=ngfft(3); nfftot=n1*n2*n3
2338  fftcache=ngfft(8)
2339  fftalg=ngfft(7); fftalga=fftalg/100; fftalgc=mod(fftalg,10)
2340  me_fft=ngfft(11)
2341  nproc_fft=ngfft(10)
2342 
2343  comm_fft = mpi_enreg%comm_fft; me_g0 = mpi_enreg%me_g0_fft
2344  paral_kgb = mpi_enreg%paral_kgb
2345 
2346  !if (ndat/=1) then
2347  !  write(std_out,*)fftalg
2348  !  ABI_ERROR("Really? I thought nobody uses ndat > 1")
2349  !end if
2350 
2351  !if (weight_r /= weight_i) then
2352  !  write(std_out,*)fftalg
2353  !  ABI_ERROR("Really? I thought nobody uses weight_r != weight_i")
2354  !end if
2355 
2356  !if (option == 0 .and. fftalgc == 0) then
2357  !  ABI_ERROR("Option 0 is buggy when fftalgc ==0 is used!")
2358  !end if
2359 
2360 !GPU version of fourwf
2361  gpu_option_=ABI_GPU_DISABLED
2362  if (PRESENT(gpu_option)) gpu_option_=gpu_option
2363 
2364  if(gpu_option_/=ABI_GPU_DISABLED) then
2365    if (present(weight_array_r)) then
2366      weight_ptr_r => weight_array_r
2367    else
2368      ABI_MALLOC(weight_ptr_r,(ndat))
2369      weight_ptr_r(:)=weight_r
2370    end if
2371    if (present(weight_array_i)) then
2372      weight_ptr_i => weight_array_i
2373    else
2374      ABI_MALLOC(weight_ptr_i,(ndat))
2375      weight_ptr_i(:)=weight_i
2376    end if
2377    if(gpu_option_==ABI_GPU_LEGACY) then
2378 #if defined HAVE_GPU_CUDA
2379      call gpu_fourwf(cplex,denpot,fofgin,fofgout,fofr,gboundin,gboundout,istwf_k,&
2380        kg_kin,kg_kout,mgfft,mpi_enreg,ndat,ngfft,npwin,npwout,n4,n5,n6,option,&
2381        paral_kgb,tim_fourwf,weight_ptr_r,weight_ptr_i) !,use_ndo,fofginb)
2382 #endif
2383    else if(gpu_option_==ABI_GPU_KOKKOS) then
2384 #if defined HAVE_GPU_CUDA && defined HAVE_YAKL
2385      call gpu_fourwf_managed(cplex,denpot,fofgin,fofgout,fofr,gboundin,gboundout,istwf_k,&
2386        kg_kin,kg_kout,mgfft,mpi_enreg,ndat,ngfft,npwin,npwout,n4,n5,n6,option,&
2387        paral_kgb,tim_fourwf,weight_ptr_r,weight_ptr_i) !,use_ndo,fofginb)
2388 #endif
2389    else if(gpu_option_==ABI_GPU_OPENMP) then
2390 #ifdef HAVE_OPENMP_OFFLOAD
2391      call ompgpu_fourwf(cplex,denpot,fofgin,fofgout,fofr,gboundin,gboundout,istwf_k,&
2392        kg_kin,kg_kout,mgfft,ndat,ngfft,npwin,npwout,n4,n5,n6,option,&
2393        weight_ptr_r,weight_ptr_i) !,use_ndo,fofginb)
2394 #endif
2395    end if
2396    if (.not.present(weight_array_r)) then
2397      ABI_FREE(weight_ptr_r)
2398    end if
2399    if (.not.present(weight_array_i)) then
2400      ABI_FREE(weight_ptr_i)
2401    end if
2402    call timab(840+tim_fourwf,2,tsec)
2403    return
2404  end if ! GPU
2405 
2406  if ((fftalgc < 0 .or. fftalgc > 2)) then
2407    write(msg, '(a,i0,5a)' )&
2408     'The input algorithm number fftalg= ',fftalg,' is not allowed.',ch10,&
2409     'The third digit, fftalg(C), must be 0, 1, or 2',ch10,&
2410     'Action: change fftalg in your input file.'
2411    ABI_ERROR(msg)
2412  end if
2413 
2414  if (fftalgc /= 0 .and. ALL(fftalga /= [1,3,4,5])) then
2415    write(msg, '(a,i0,5a)' )&
2416     'The input algorithm number fftalg= ',fftalg,' is not allowed.',ch10,&
2417     'The first digit must be 1,3,4 when the last digit is not 0.',ch10,&
2418     'Action: change fftalg in your input file.'
2419    ABI_ERROR(msg)
2420  end if
2421 
2422  if (option < 0 .or. option > 3)then
2423    write(msg, '(a,i0,3a)' )&
2424     'The option number ',option,' is not allowed.',ch10,&
2425     'Only option=0, 1, 2 or 3 are allowed presently.'
2426    ABI_ERROR(msg)
2427  end if
2428 
2429  if (option == 1 .and. cplex /= 1) then
2430    write(msg, '(3a,i0,a)' )&
2431     'With the option number 1, cplex must be 1,',ch10,&
2432     'but it is cplex= ',cplex,'.'
2433    ABI_ERROR(msg)
2434  end if
2435 
2436  if (option==2 .and. (cplex/=1 .and. cplex/=2)) then
2437    write(msg, '(3a,i0,a)' )&
2438     'With the option number 2, cplex must be 1 or 2,',ch10,&
2439     'but it is cplex= ',cplex,'.'
2440    ABI_ERROR(msg)
2441  end if
2442 
2443  ! DMFT uses its own FFT algorithm (that should be wrapped in a different routine!)
2444  luse_ndo=.false.
2445  if (present(use_ndo).and.present(fofginb)) then
2446    if(use_ndo==1) then
2447      luse_ndo=.true.
2448      if((size(fofginb,2)==0)) then
2449        write(msg, '(a,a,a,i4,i5)' )&
2450         'fofginb has a dimension equal to zero and use_ndo==1',ch10,&
2451         'Action: check dimension of fofginb',size(fofginb,2),use_ndo
2452        ABI_ERROR(msg)
2453      end if
2454    end if
2455  end if
2456 
2457  if (luse_ndo) then
2458    if (.not.(fftalgc==2 .and. option/=3)) then
2459      ABI_ERROR("luse_ndo but not .not.(fftalgc==2 .and. option/=3)")
2460    end if
2461    ABI_CHECK(nproc_fft==1, "DMFT with nproc_fft != 1")
2462    ABI_CHECK(ndat == 1, "use_ndo and ndat != 1 not coded")
2463 
2464    call sg_fftrisc_2(cplex,denpot,fofgin,fofgout,fofr,gboundin,gboundout,&
2465      istwf_k,kg_kin,kg_kout,&
2466      mgfft,ngfft,npwin,npwout,n4,n5,n6,option,weight_r,weight_2=weight_i,luse_ndo=luse_ndo,fofgin_p=fofginb)
2467    goto 100
2468  end if
2469 
2470  ! Get the distrib associated with this fft_grid => for i2 and i3 planes
2471  call ptabs_fourwf(mpi_enreg,n2,n3,fftn2_distrib,ffti2_local,fftn3_distrib,ffti3_local)
2472 
2473  ! Branch immediately depending on nproc_fft
2474  if (nproc_fft > 1 .and. fftalg /= 412) then
2475    call fourwf_mpi(cplex,denpot,fofgin,fofgout,fofr,gboundin,gboundout,&
2476      istwf_k,kg_kin,kg_kout,me_g0,mgfft,ngfft,mpi_enreg%distribfft,n1,n2,n3,npwin,npwout,&
2477      n4,n5,n6,ndat,option,weight_r,weight_i,comm_fft)
2478    goto 100
2479  end if
2480 
2481  select case (fftalga)
2482 
2483  case (FFT_FFTW3)
2484    if (luse_ndo) ABI_ERROR("luse_ndo not supported by FFTW3")
2485    if (nproc_fft == 1) then
2486      ! call wrtout(std_out, "FFTW3_SEQFOURWF")
2487      call fftw3_seqfourwf(cplex,denpot,fofgin,fofgout,fofr,gboundin,gboundout,istwf_k,&
2488        kg_kin,kg_kout,mgfft,ndat,ngfft,npwin,npwout,n4,n5,n6,option,weight_r,weight_i)
2489    else
2490      ABI_ERROR("Not coded")
2491    end if
2492 
2493  case (FFT_DFTI)
2494    if (luse_ndo) ABI_ERROR("luse_ndo not supported by DFTI")
2495    if (nproc_fft == 1) then
2496      ! call wrtout(std_out, "DFTI_SEQFOURWF")
2497      call dfti_seqfourwf(cplex,denpot,fofgin,fofgout,fofr,gboundin,gboundout,istwf_k,&
2498        kg_kin,kg_kout,mgfft,ndat,ngfft,npwin,npwout,n4,n5,n6,option,weight_r,weight_i)
2499    else
2500      ABI_ERROR("Not coded")
2501    end if
2502 
2503  case default
2504    ! TODO: Clean this code!
2505 
2506    ! Here, use routines that make forwards FFT separately of backwards FFT,
2507    ! in particular, usual 3DFFT library routines, called in ccfft.
2508    if (fftalgc==0 .or. (fftalgc==1 .and. fftalga/=4) .or. &
2509       (fftalgc==2 .and. fftalga/=4 .and. option==3) )then
2510 
2511      ABI_MALLOC(work1,(2,n4,n5,n6*ndat))
2512 
2513      if (option/=3)then
2514        ! Insert fofgin into the fft box (array fofr)
2515 
2516        if (fftalga/=4)then
2517          iflag=1
2518          call sphere(fofgin,ndat,npwin,fofr,n1,n2,n3,n4,n5,n6,kg_kin,istwf_k,iflag,me_g0,shiftg0,symmE,one)
2519 
2520        else if (fftalga==4 .and. fftalgc==0) then
2521          ! Note the switch of n5 and n6, as they are only
2522          ! needed to dimension work2 inside "sphere"
2523          ABI_MALLOC(work2,(2,n4,n6,n5*ndat))
2524 
2525          iflag=2
2526          nd2proc=((n2-1)/nproc_fft) +1
2527          nd3proc=((n6-1)/nproc_fft) +1
2528          ABI_MALLOC(work3,(2,n4,n6,nd2proc*ndat))
2529          ABI_MALLOC(work4,(2,n4,n5,nd3proc*ndat))
2530 
2531          if (istwf_k == 1 .and. paral_kgb==1) then
2532            ! sphere dont need a big array
2533            work3=zero
2534            call sphere_fft(fofgin,ndat,npwin,work3,n1,n2,n3,n4,n6,kg_kin,&
2535              mpi_enreg%distribfft%tab_fftwf2_local,nd2proc)
2536          else
2537            ! sphere needs a big array and communications
2538            if (nproc_fft == 1 .and. ndat == 1 .and. istwf_k == 1) then
2539              ! dimensions of tab work3 and work2 are identical no need to use work2
2540              work3=zero
2541              call sphere(fofgin,ndat,npwin,work3,n1,n2,n3,n4,n6,nd2proc,&
2542                kg_kin,istwf_k,iflag,me_g0,shiftg0,symmE,one)
2543            else
2544              work2=zero
2545              call sphere(fofgin,ndat,npwin,work2,n1,n2,n3,n4,n6,n5,&
2546                kg_kin,istwf_k,iflag,me_g0,shiftg0,symmE,one)
2547 
2548              if (paral_kgb==1 .and. istwf_k > 1) then
2549                ! Collect G-vectors on each node
2550                work3=zero
2551                ABI_MALLOC(work_sum,(2,n4,n6,n5*ndat))
2552                call timab(48,1,tsec)
2553                call xmpi_sum(work2,work_sum,2*n4*n6*n5*ndat,comm_fft,ier)
2554                call timab(48,2,tsec)
2555 
2556                ! Extract my list of G-vectors needed for MPI-FFT.
2557                do idat=1,ndat
2558                  do i2=1,n2
2559                    if( fftn2_distrib(i2) == me_fft) then
2560                      i2_local = ffti2_local(i2) + nd2proc*(idat-1)
2561                      do i3=1,n3
2562                        do i1=1,n1
2563                          work3(1,i1,i3,i2_local)=work_sum(1,i1,i3,i2+n5*(idat-1))
2564                          work3(2,i1,i3,i2_local)=work_sum(2,i1,i3,i2+n5*(idat-1))
2565                        end do
2566                      end do
2567                    end if
2568                  end do
2569                end do
2570                ABI_FREE(work_sum)
2571              end if
2572 
2573              if (paral_kgb/=1) then
2574                do idat=1,ndat
2575                  do i2=1,n2
2576                    do i3=1,n3
2577                      do i1=1,n1
2578                        work3(1,i1,i3,i2+nd2proc*(idat-1))=work2(1,i1,i3,i2+n5*(idat-1))
2579                        work3(2,i1,i3,i2+nd2proc*(idat-1))=work2(2,i1,i3,i2+n5*(idat-1))
2580                      end do
2581                    end do
2582                  end do
2583                end do
2584              end if
2585            end if
2586          end if
2587          if (paral_kgb==1) then
2588            option_ccfft=1
2589          else
2590            option_ccfft=2
2591          end if
2592        end if
2593 
2594        ! Fourier transform fofr (reciprocal to real space)
2595        ! The output might be in work1 or fofr, depending on inplace
2596        if (fftalgc==0) then
2597          if (fftalga/=4) then
2598            ! Call usual 3DFFT library routines
2599            call ccfft(ngfft,+1,n1,n2,n3,n4,n5,n6,ndat,2,fofr,work1,comm_fft)
2600          else
2601            ! SG simplest complex-to-complex routine
2602            call ccfft(ngfft,+1,n1,n2,n3,n4,n5,n6,ndat,option_ccfft,work3,work4,comm_fft)
2603            ABI_FREE(work2)
2604            ABI_FREE(work3)
2605          end if
2606        else
2607          ! Call SG routine, with zero padding
2608          call sg_fftpad(fftcache,mgfft,n1,n2,n3,n4,n5,n6,ndat,gboundin,+1,fofr,work1)
2609        end if
2610      end if ! option/=3
2611 
2612      ! Note that if option==0 everything is alright already, the output is available in fofr.
2613      ! MG: TODO: Rewrite this mess in human-readable form!
2614      if (option==0) then
2615        if (fftalgc==0) then
2616          if (fftalga/=4) then
2617            call DCOPY(2*n4*n5*n6*ndat,work1,1,fofr,1)
2618          else
2619            call DCOPY(2*n4*n5*n6*ndat,work4,1,fofr,1)
2620          end if
2621        else
2622          ! Results are copied to fofr.
2623          call DCOPY(2*n4*n5*n6*ndat,work1,1,fofr,1)
2624        end if
2625      end if
2626 
2627      if (option==1) then
2628        ! Accumulate density
2629        if ((fftalgc==0) .and. (fftalga==4)) then
2630          do idat=1,ndat
2631            do i3=1,n3
2632              if( me_fft == fftn3_distrib(i3) ) then
2633                i3_local = ffti3_local(i3) + nd3proc*(idat-1)
2634                do i2=1,n2
2635                  do i1=1,n1
2636                    denpot(i1,i2,i3)=denpot(i1,i2,i3)+&
2637                      weight_r*work4(1,i1,i2,i3_local)**2+&
2638                      weight_i*work4(2,i1,i2,i3_local)**2
2639                  end do
2640                end do
2641              end if
2642            end do
2643          end do ! idat
2644        else
2645          call cg_addtorho(n1,n2,n3,n4,n5,n6,ndat,weight_r,weight_i,work1,denpot)
2646        end if
2647      end if ! option==1
2648 
2649      if (option==2) then
2650        ! Apply local potential
2651        if (cplex==1) then
2652 
2653          if ((fftalgc==0) .and. (fftalga==4)) then
2654 !$OMP PARALLEL DO PRIVATE(i3_local,i3_glob)
2655            do idat=1,ndat
2656              do i3=1,n3
2657                if( me_fft == fftn3_distrib(i3) ) then
2658                  i3_local = ffti3_local(i3) + nd3proc*(idat-1)
2659                  i3_glob = i3+n3*(idat-1)
2660                  do i2=1,n2
2661                    do i1=1,n1
2662                      fofr(1,i1,i2,i3_glob)= denpot(i1,i2,i3)*work4(1,i1,i2,i3_local)
2663                      fofr(2,i1,i2,i3_glob)= denpot(i1,i2,i3)*work4(2,i1,i2,i3_local)
2664                    end do
2665                  end do
2666                end if
2667              end do
2668            end do
2669          end if
2670          if ((fftalgc/=0) .or. (fftalga/=4)) then
2671 !$OMP PARALLEL DO PRIVATE(i3_glob)
2672            do idat=1,ndat
2673              do i3=1,n3
2674                if( me_fft == fftn3_distrib(i3) ) then
2675                  i3_glob = i3+n3*(idat-1)
2676                  do i2=1,n2
2677                    do i1=1,n1
2678                      fofr(1,i1,i2,i3_glob)=denpot(i1,i2,i3)*work1(1,i1,i2,i3+n3*(idat-1))
2679                      fofr(2,i1,i2,i3_glob)=denpot(i1,i2,i3)*work1(2,i1,i2,i3+n3*(idat-1))
2680                    end do
2681                  end do
2682                end if
2683              end do
2684            end do
2685          end if
2686 
2687        else if (cplex==2) then
2688          if ((fftalgc==0) .and. (fftalga==4)) then
2689 !$OMP PARALLEL DO PRIVATE(fre,fim,i3_local,i3_glob)
2690            do idat=1,ndat
2691              do i3=1,n3
2692                if( me_fft == fftn3_distrib(i3) ) then
2693                  i3_local = ffti3_local(i3) + nd3proc*(idat-1)
2694                  i3_glob = i3+n3*(idat-1)
2695                  do i2=1,n2
2696                    do i1=1,n1
2697                      fre=work4(1,i1,i2,i3_local)
2698                      fim=work4(2,i1,i2,i3_local)
2699                      fofr(1,i1,i2,i3_glob)=denpot(2*i1-1,i2,i3)*fre -denpot(2*i1,i2,i3)*fim
2700                      fofr(2,i1,i2,i3_glob)=denpot(2*i1-1,i2,i3)*fim +denpot(2*i1,i2,i3)*fre
2701                    end do
2702                  end do
2703                end if
2704              end do
2705            end do
2706          end if
2707 
2708          if ((fftalgc/=0) .or. (fftalga/=4)) then
2709 !$OMP PARALLEL DO PRIVATE(fre,fim,i3_glob)
2710            do idat=1,ndat
2711              do i3=1,n3
2712                if( me_fft == fftn3_distrib(i3) ) then
2713                  i3_glob = i3+n3*(idat-1)
2714                  do i2=1,n2
2715                    do i1=1,n1
2716                      fre=work1(1,i1,i2,i3+n3*(idat-1))
2717                      fim=work1(2,i1,i2,i3+n3*(idat-1))
2718                      fofr(1,i1,i2,i3_glob)=denpot(2*i1-1,i2,i3)*fre -denpot(2*i1,i2,i3)*fim
2719                      fofr(2,i1,i2,i3_glob)=denpot(2*i1-1,i2,i3)*fim +denpot(2*i1,i2,i3)*fre
2720                    end do
2721                  end do
2722                end if
2723              end do
2724            end do
2725          end if
2726        end if ! cplex=2
2727 
2728      end if ! option=2
2729 
2730      ! The data for option==2 or option==3 is now in fofr.
2731      if (option==2 .or. option==3) then
2732 
2733        if (fftalgc==0) then
2734          ! Call usual 3DFFT library routines or SG simplest complex-to-complex routine
2735          if (fftalga==FFT_SG2002) then
2736            ABI_FREE(work1)
2737            ABI_MALLOC(work1,(2,n4,n6,n5*ndat))
2738          end if
2739 
2740          if (option==3 .or. fftalga/=4) then
2741            call ccfft(ngfft,-1,n1,n2,n3,n4,n5,n6,ndat,2,fofr,work1,comm_fft)
2742          else
2743            ! creation of small arrays
2744            ! nd3proc=((n5-1)/nproc_fft) +1
2745            nd3proc=((n6-1)/nproc_fft) +1
2746            nd2proc=((n2-1)/nproc_fft) +1
2747            ABI_MALLOC(work3,(2,n4,n5,nd3proc*ndat))
2748            ABI_MALLOC(work2,(2,n4,n6,nd2proc*ndat))
2749 
2750            if (paral_kgb==1) then
2751 
2752              if (cplex==1) then
2753                do idat=1,ndat
2754                  do i3=1,n3
2755                    if( me_fft == fftn3_distrib(i3) ) then
2756                      i3_local = ffti3_local(i3) + nd3proc*(idat-1)
2757                      do i2=1,n2
2758                        do i1=1,n1
2759                          work3(1,i1,i2,i3_local)=denpot(i1,i2,i3)*work4(1,i1,i2,i3_local)
2760                          work3(2,i1,i2,i3_local)=denpot(i1,i2,i3)*work4(2,i1,i2,i3_local)
2761                        end do
2762                      end do
2763                    end if
2764                  end do
2765                end do
2766              else
2767                do idat=1,ndat
2768                  do i3=1,n3
2769                    if( me_fft == fftn3_distrib(i3) ) then
2770                      i3_local = ffti3_local(i3) + nd3proc*(idat-1)
2771                      do i2=1,n2
2772                        do i1=1,n1
2773                          fre=work4(1,i1,i2,i3_local)
2774                          fim=work4(2,i1,i2,i3_local)
2775                          work3(1,i1,i2,i3_local) = denpot(2*i1-1,i2,i3)*fre-denpot(2*i1,i2,i3)*fim
2776                          work3(2,i1,i2,i3_local) = denpot(2*i1-1,i2,i3)*fim+denpot(2*i1,i2,i3)*fre
2777                        end do
2778                      end do
2779                    end if
2780                  end do
2781                end do
2782              end if
2783              option_ccfft=1
2784 
2785            else
2786              if (nproc_fft /=1 .or. ndat /= 1 ) then
2787                do idat=1,ndat
2788                  do i3=1,n3
2789                    do i2=1,n2
2790                      do i1=1,n1
2791                        work3(1,i1,i2,i3+nd3proc*(idat-1))=fofr(1,i1,i2,i3+n3*(idat-1))
2792                        work3(2,i1,i2,i3+nd3proc*(idat-1))=fofr(2,i1,i2,i3+n3*(idat-1))
2793                      end do
2794                    end do
2795                  end do
2796                end do
2797                option_ccfft=2
2798              end if
2799            end if
2800 
2801            if (paral_kgb==1) then
2802              call ccfft(ngfft,-1,n1,n2,n3,n4,n5,n6,ndat,option_ccfft,work3,work2,comm_fft)
2803            else
2804              if (nproc_fft /=1 .or. ndat /= 1 ) then
2805                call ccfft(ngfft,-1,n1,n2,n3,n4,n5,n6,ndat,option_ccfft,work3,work2,comm_fft)
2806              else
2807                call ccfft(ngfft,-1,n1,n2,n3,n4,n5,n6,ndat,option_ccfft,fofr,work1,comm_fft)
2808              end if
2809            end if
2810 
2811            ! load of work1
2812            if ((paral_kgb==1) .and.  ( istwf_k > 1 )) work1(:,:,:,:)=zero
2813 
2814            if (paral_kgb==1) then
2815              if ( istwf_k > 1 ) then
2816                do idat=1,ndat
2817                  do i2=1,n2
2818                    if( me_fft == fftn2_distrib(i2) ) then
2819                      i2_local = ffti2_local(i2) + nd2proc*(idat-1)
2820                      do i3=1,n3
2821                        do i1=1,n1
2822                          work1(1,i1,i3,i2+n5*(idat-1))= work2(1,i1,i3,i2_local)
2823                          work1(2,i1,i3,i2+n5*(idat-1))= work2(2,i1,i3,i2_local)
2824                        end do
2825                      end do
2826                    end if
2827                  end do
2828                end do
2829              end if
2830 
2831            else
2832              if (nproc_fft /=1 .or. ndat /= 1 ) then
2833                do idat=1,ndat
2834                  do i2=1,n2
2835                    do i3=1,n3
2836                      do i1=1,n1
2837                        work1(1,i1,i3,i2+n5*(idat-1))=work2(1,i1,i3,i2+nd2proc*(idat-1))
2838                        work1(2,i1,i3,i2+n5*(idat-1))=work2(2,i1,i3,i2+nd2proc*(idat-1))
2839                      end do
2840                    end do
2841                  end do
2842                end do
2843              end if
2844            end if
2845            ABI_FREE(work3)
2846            if ((paral_kgb==1) .and.  ( istwf_k > 1 )) then
2847              call timab(48,1,tsec)
2848              call xmpi_sum(work1,comm_fft,ier)
2849              call timab(48,2,tsec)
2850            end if
2851          end if
2852 
2853        else
2854          ! Call SG routine, with zero padding
2855          call sg_fftpad(fftcache,mgfft,n1,n2,n3,n4,n5,n6,ndat,gboundout,-1,fofr,work1)
2856        end if
2857 
2858        xnorm = one/dble(nfftot)
2859 
2860        if (fftalga/=4) then
2861          call cg_box2gsph(n1,n2,n3,n4,n5,n6,ndat,npwout,kg_kout,work1,fofgout, rscal=xnorm)
2862        else
2863          ! if fftalga==4
2864          if ((paral_kgb==1) .and. ( istwf_k == 1 )) then
2865 !$OMP PARALLEL DO PRIVATE(i1,i2,i3,i2_local)
2866            do idat=1,ndat
2867              do ig=1,npwout
2868                i1=kg_kout(1,ig); if(i1<0)i1=i1+n1 ; i1=i1+1
2869                i2=kg_kout(2,ig); if(i2<0)i2=i2+n2 ; i2=i2+1
2870                i3=kg_kout(3,ig); if(i3<0)i3=i3+n3 ; i3=i3+1
2871                i2_local = ffti2_local(i2) + nd2proc*(idat-1)
2872                fofgout(1,ig+npwout*(idat-1))= work2(1,i1,i3,i2_local)*xnorm
2873                fofgout(2,ig+npwout*(idat-1))= work2(2,i1,i3,i2_local)*xnorm
2874              end do
2875            end do
2876            ABI_FREE(work2)
2877          else
2878 !$OMP PARALLEL DO PRIVATE(i1,i2,i3)
2879            do idat=1,ndat
2880              do ig=1,npwout
2881                i1=kg_kout(1,ig); if(i1<0)i1=i1+n1; i1=i1+1
2882                i2=kg_kout(2,ig); if(i2<0)i2=i2+n2; i2=i2+1
2883                i3=kg_kout(3,ig); if(i3<0)i3=i3+n3; i3=i3+1
2884                fofgout(1,ig+npwout*(idat-1))=work1(1,i1,i3,i2+n5*(idat-1))*xnorm
2885                fofgout(2,ig+npwout*(idat-1))=work1(2,i1,i3,i2+n5*(idat-1))*xnorm
2886              end do
2887            end do
2888          end if
2889        end if ! fftalga
2890      end if ! if option==2 or 3
2891 
2892      ABI_SFREE(work1)
2893    end if
2894 
2895    ! Here, call more specialized 3-dimensional fft
2896    ! (zero padding as well as maximize cache reuse) based on S Goedecker routines.
2897    ! Specially tuned for cache architectures.
2898    if (fftalga==FFT_SG .and. fftalgc==2 .and. option/=3) then
2899      call sg_fftrisc(cplex,denpot,fofgin,fofgout,fofr,gboundin,gboundout,&
2900        istwf_k,kg_kin,kg_kout,mgfft,ndat,ngfft,npwin,npwout,n4,n5,n6,option,weight_r,weight_i)
2901    end if
2902 
2903    ! Here, call new FFT from S Goedecker, also sophisticated specialized 3-dimensional fft
2904    ! (zero padding as well as maximize cache reuse)
2905    if (fftalga==FFT_SG2002 .and. fftalgc/=0) then
2906      ! The args are not the same as fourwf, but might be
2907      call fourwf_mpi(cplex,denpot,fofgin,fofgout,fofr,gboundin,gboundout,&
2908        istwf_k,kg_kin,kg_kout,me_g0,mgfft,ngfft,mpi_enreg%distribfft,n1,n2,n3,npwin,npwout,&
2909        n4,n5,n6,ndat,option,weight_r,weight_i,comm_fft)
2910    end if
2911 
2912    if (option==0.and.(fftalga==FFT_SG.or.fftalga==FFT_SG2002)) then
2913      ! In these cases, add the periodic image of the borders so all fofr components are computed
2914      do i3=1,n3
2915        if (n1==n4-1) then
2916          do i2=1,n2
2917            fofr(:,n4,i2,i3)=fofr(:,1,i2,i3)
2918          end do
2919        end if
2920        if (n2==n5-1) then
2921          fofr(:,:,n5,i3)=fofr(:,:,1,i3)
2922        end if
2923      end do
2924    end if
2925 
2926    ABI_SFREE(work4)
2927    ABI_SFREE(work2)
2928  end select
2929 
2930 ! Accumulate timing
2931  100 continue
2932  call timab(840+tim_fourwf,2,tsec)
2933 
2934 end subroutine fourwf

ABINIT/m_fft [ Modules ]

[ Top ] [ Modules ]

NAME

  m_fft

FUNCTION

  This module provides driver routines for sequential FFTs (OpenMP threads are supported).
  It also defines generic interfaces for single or double precision FFTs.

COPYRIGHT

 Copyright (C) 2009-2024 ABINIT group (MG, MM, GZ, MT, MF, XG, PT, FF)
 This file is distributed under the terms of the
 GNU General Public License, see ~abinit/COPYING
 or http://www.gnu.org/copyleft/gpl.txt .

SOURCE

 17 #if defined HAVE_CONFIG_H
 18 #include "config.h"
 19 #endif
 20 
 21 #include "abi_common.h"
 22 
 23 MODULE m_fft
 24 
 25  use defs_basis
 26  use m_abicore
 27  use m_errors
 28  use m_xomp
 29  use m_xmpi
 30  use m_cplxtools
 31  use m_cgtools
 32  use m_sgfft
 33  use m_sg2002
 34  use m_fftw3
 35  use m_dfti
 36 #if defined HAVE_MPI2
 37  use mpi
 38 #endif
 39 
 40  use defs_abitypes,   only : MPI_type
 41  use defs_fftdata,    only : mg
 42  use m_time,          only : cwtime, timab
 43  use m_numeric_tools, only : r2c
 44  use m_fstrings,      only : sjoin, itoa
 45  use m_geometry,      only : metric
 46  use m_hide_blas,     only : xscal
 47  use m_fftcore,       only : get_cache_kb, kpgsph, get_kg, sphere_fft, sphere_fft1, sphere, change_istwfk, &
 48                              fftalg_info, fftalg_has_mpi, print_ngfft, getng, sphereboundary
 49  use m_mpinfo,        only : destroy_mpi_enreg, ptabs_fourdp, ptabs_fourwf, initmpi_seq
 50  use m_distribfft,    only : distribfft_type, init_distribfft, destroy_distribfft
 51 
 52 #if defined HAVE_GPU_CUDA
 53  ! MG: Had to comment this line to avoid "Ambiguous reference to c_ptr on buda2 with CUDA
 54  use m_manage_cuda
 55 #endif
 56  use m_ompgpu_fourwf
 57 
 58  use, intrinsic :: iso_c_binding
 59 
 60  implicit none
 61 
 62  private
 63 
 64 #if defined HAVE_MPI1
 65  include 'mpif.h'
 66 #endif
 67 
 68  public :: fft_ug               ! Driver for zero-padded FFTs u(g) --> u(r)
 69  public :: fft_ur               ! Driver for zero-padded FFTs u(r) --> u(g)
 70  public :: fftpad               ! Driver for (low-level) zero-padded FFTs, note that fft_ug is the preferred interface.
 71  public :: fft_poisson          ! Solve the poisson equation in G-space starting from n(r).
 72  public :: fourdp_6d            ! Calculate a 6-dimensional Fast Fourier Transform
 73  public :: fftpac               ! Copy to change the stride of a three-dimension array for more efficient FFT.
 74  public :: indirect_parallel_Fourier
 75 
 76  public :: fft_use_lib_threads
 77  public :: fft_allow_ialltoall  ! Allow the use of non-blocking IALLOTOALL in MPI-FFTs algorithms
 78  public :: zerosym              ! Symmetrize an array on the FFT grid by vanishing some term on the boundaries.
 79 
 80 ! Main entry points.
 81  public :: fourdp
 82  public :: fourwf
 83 
 84  integer,public,save :: fourdp_counter = -1
 85  integer,public,save :: fourwf_counter = -1
 86  public :: fft_init_counters
 87  public :: fft_stop_counters
 88  public :: fft_output_counters
 89 
 90 ! Driver routines for MPI version.
 91  public :: fourdp_mpi           ! MPI FFT of densities/potentials on the full box.
 92  public :: fourwf_mpi           ! specialized MPI-FFT for wavefunctions.
 93  !public :: fftmpi_u
 94 
 95  interface fft_ug
 96    module procedure fft_ug_sp
 97    module procedure fft_ug_dp
 98    module procedure fft_ug_spc
 99    module procedure fft_ug_dpc
100  end interface fft_ug
101 
102  interface fft_ur
103    module procedure fft_ur_dp
104    module procedure fft_ur_spc
105    module procedure fft_ur_dpc
106  end interface fft_ur
107 
108  interface fftpad
109    module procedure fftpad_spc
110    module procedure fftpad_dpc
111  end interface fftpad

m_fft/fft_allow_ialltoall [ Functions ]

[ Top ] [ m_fft ] [ Functions ]

NAME

  fft_allow_ialltoall

FUNCTION

  Allow the use of non-blocking IALLOTOALL in MPI-FFTs algorithms
  Mainly used for profiling purposes.

SOURCE

271 subroutine fft_allow_ialltoall(bool)
272 
273 !Arguments ------------------------------------
274  logical,intent(in) :: bool
275 
276 ! *************************************************************************
277 
278  ALLOW_IALLTOALL = bool
279 #ifndef HAVE_MPI_IALLTOALL
280  ALLOW_IALLTOALL = .False.
281 #endif
282 
283 end subroutine fft_allow_ialltoall

m_fft/fft_poisson [ Functions ]

[ Top ] [ m_fft ] [ Functions ]

NAME

 fft_poisson

FUNCTION

  Driver routine to solve the Poisson equation in G-space given the density, n(r),
  in real space of the FFT box.

INPUTS

 ngfft(18)=Info on the 3D FFT.
 cplex=1 if fofr is real, 2 if fofr is complex
 nx,ny,nz=Number of FFT points along the three directions.
 ldx,ldy,ldz=Leading dimension of the array nr and vg.
 ndat = Number of densities
 vg(nx*ny*nz)=Potential in reciprocal space.

SIDE EFFECTS

 nr(cplex*ldx*ldy*ldz*ndat)
    input: n(r) (real or complex)
    output: the hartree potential in real space

NOTES

   vg is given on the FFT mesh instead of the augmented mesh [ldx,ldy,ldz]
   in order to simplify the interface with the other routines operating of vg

SOURCE

1111 subroutine fft_poisson(ngfft, cplex, nx, ny, nz, ldx, ldy, ldz, ndat, vg, nr)
1112 
1113 !Arguments ------------------------------------
1114 !scalars
1115  integer,intent(in) :: cplex,nx,ny,nz,ldx,ldy,ldz,ndat
1116  integer,intent(in) :: ngfft(18)
1117 !arrays
1118  real(dp),intent(inout) :: nr(cplex*ldx*ldy*ldz*ndat)
1119  real(dp),intent(in) :: vg(nx*ny*nz)
1120 
1121 !Local variables-------------------------------
1122  integer :: fftalga, fftcache
1123 
1124 ! *************************************************************************
1125 
1126  fftalga = ngfft(7)/100; fftcache = ngfft(8)
1127 
1128  select case (fftalga)
1129  case (FFT_SG, FFT_SG2002)
1130    ! Note: according to my tests fftalg 1xx is always faster than 4xx in sequential
1131    ! hence it does not make sense to provide a fallback for fftalg 4xx.
1132    ! Use external FFT libraries if you want to run at top-level speed.
1133    call sg_poisson(fftcache,cplex,nx,ny,nz,ldx,ldy,ldz,ndat,vg,nr)
1134 
1135  case (FFT_FFTW3)
1136    call fftw3_poisson(cplex,nx,ny,nz,ldx,ldy,ldz,ndat,vg,nr)
1137 
1138  !case (FFT_DFTI)
1139  !  call dfti_poisson(cplex,nx,ny,nz,ldx,ldy,ldz,ndat,vg,nr)
1140 
1141  case default
1142    ABI_BUG(sjoin("Wrong value for fftalga: ",itoa(fftalga)))
1143  end select
1144 
1145 end subroutine fft_poisson

m_fft/fft_ug_dp [ Functions ]

[ Top ] [ m_fft ] [ Functions ]

NAME

 fft_ug_dp

FUNCTION

 Driver routine for G-->R transform of wavefunctions with zero-padded FFT.
 TARGET: double precision real arrays with Re/Im.

INPUTS

  See fft_ug_dpc

SOURCE

647 subroutine fft_ug_dp(npw_k, nfft, nspinor, ndat, mgfft, ngfft, istwf_k, kg_k, gbound_k, ug, ur)
648 
649 !Arguments ------------------------------------
650 !scalars
651  integer,intent(in) :: npw_k,nfft,nspinor,istwf_k,mgfft,ndat
652 !arrays
653  integer,intent(in) :: ngfft(18),gbound_k(2*mgfft+8,2),kg_k(3,npw_k)
654  real(dp),target,intent(in) :: ug(*)  !2*npw_k*nspinor*ndat)
655  real(dp),target,intent(out) :: ur(*) !2*nfft*nspinor*ndat)
656 
657 !Local variables-------------------------------
658  complex(dp),contiguous,pointer :: ug_cplx(:), ur_cplx(:)
659 
660 ! *************************************************************************
661 
662  call C_F_pointer(c_loc(ug), ug_cplx, shape=[npw_k*nspinor*ndat])
663  call C_F_pointer(c_loc(ur), ur_cplx, shape=[nfft*nspinor*ndat])
664 
665  call fft_ug_dpc(npw_k, nfft, nspinor, ndat, mgfft, ngfft, istwf_k, kg_k, gbound_k, ug_cplx, ur_cplx)
666 
667 end subroutine fft_ug_dp

m_fft/fft_ug_dpc [ Functions ]

[ Top ] [ m_fft ] [ Functions ]

NAME

 fft_ug_dpc

FUNCTION

 Driver routine for G-->R transform of wavefunctions with zero-padded FFT.
 TARGET: double precision arrays

INPUTS

 npw_k=number of plane waves for this k-point.
 nfft=Number of FFT points.
 nspinor=number of spinorial components
 ndat=Numer of wavefunctions to transform.
 mgfft=Max number of FFT divisions
 ngfft(18)=information about 3D FFT, see ~abinit/doc/variables/vargs.htm#ngfft
 istwfk=Option describing the storage of the wavefunction. (at present must be 1)
 kg_k(3,npw_k)=G-vectors in reduced coordinates
 gbound_k_k(2*mgfft+8,2)=Table for padded-FFT. See sphereboundary.
 ug(npw_k*nspinor*ndat)=wavefunctions in reciprocal space

OUTPUT

  ur(nfft*nspinor*ndat)=wavefunctions in real space.

SOURCE

741 subroutine fft_ug_dpc(npw_k, nfft, nspinor, ndat, mgfft, ngfft, istwf_k, kg_k, gbound_k, ug, ur)
742 
743 !Arguments ------------------------------------
744 !scalars
745  integer,intent(in) :: npw_k,nfft,nspinor,istwf_k,mgfft,ndat
746 !arrays
747  integer,intent(in) :: ngfft(18),gbound_k(2*mgfft+8,2),kg_k(3,npw_k)
748  complex(dpc),intent(in) :: ug(*)  !npw_k*nspinor*ndat)
749  complex(dpc),intent(out) :: ur(*) !nfft*nspinor*ndat)
750 
751 ! *************************************************************************
752 
753 #include "fftug_driver.finc"
754 
755 end subroutine fft_ug_dpc

m_fft/fft_ug_sp [ Functions ]

[ Top ] [ m_fft ] [ Functions ]

NAME

 fft_ug_sp

FUNCTION

 Driver routine for G-->R transform of wavefunctions with zero-padded FFT.
 TARGET: single precision real arrays with Re/Im.

INPUTS

  See fft_ug_dpc

SOURCE

611 subroutine fft_ug_sp(npw_k, nfft, nspinor, ndat, mgfft, ngfft, istwf_k, kg_k, gbound_k, ug, ur)
612 
613 !Arguments ------------------------------------
614 !scalars
615  integer,intent(in) :: npw_k,nfft,nspinor,istwf_k,mgfft,ndat
616 !arrays
617  integer,intent(in) :: ngfft(18),gbound_k(2*mgfft+8,2),kg_k(3,npw_k)
618  real(sp),target,intent(in) :: ug(2*npw_k*nspinor*ndat)
619  real(sp),target,intent(out) :: ur(2*nfft*nspinor*ndat)
620 
621 !Local variables-------------------------------
622  complex(sp),contiguous,pointer :: ug_cplx(:), ur_cplx(:)
623 
624 ! *************************************************************************
625 
626  call C_F_pointer(c_loc(ug), ug_cplx, shape=[npw_k*nspinor*ndat])
627  call C_F_pointer(c_loc(ur), ur_cplx, shape=[nfft*nspinor*ndat])
628 
629  call fft_ug_spc(npw_k, nfft, nspinor, ndat, mgfft, ngfft, istwf_k, kg_k, gbound_k, ug_cplx, ur_cplx)
630 
631 end subroutine fft_ug_sp

m_fft/fft_ug_spc [ Functions ]

[ Top ] [ m_fft ] [ Functions ]

NAME

 fft_ug_spc

FUNCTION

 Driver routine for G-->R transform of wavefunctions with zero-padded FFT.
 TARGET: single precision arrays

INPUTS

 npw_k=number of plane waves for this k-point.
 nfft=Number of FFT points.
 nspinor=number of spinorial components
 ndat=Numer of wavefunctions to transform.
 mgfft=Max number of FFT divisions
 ngfft(18)=information about 3D FFT, see ~abinit/doc/variables/vargs.htm#ngfft
 istwfk=Option describing the storage of the wavefunction. (at present must be 1)
 kg_k(3,npw_k)=G-vectors in reduced coordinates
 gbound_k_k(2*mgfft+8,2)=Table for padded-FFT. See sphereboundary.
 ug(npw_k*nspinor*ndat)=wavefunctions in reciprocal space

OUTPUT

  ur(nfft*nspinor*ndat)=wavefunctions in real space.

SOURCE

697 subroutine fft_ug_spc(npw_k, nfft, nspinor, ndat, mgfft, ngfft, istwf_k, kg_k, gbound_k, ug, ur)
698 
699 !Arguments ------------------------------------
700 !scalars
701  integer,intent(in) :: npw_k,nfft,nspinor,istwf_k,mgfft,ndat
702 !arrays
703  integer,intent(in) :: ngfft(18),gbound_k(2*mgfft+8,2),kg_k(3,npw_k)
704  complex(spc),intent(in) :: ug(*)  !npw_k*nspinor*ndat)
705  complex(spc),intent(out) :: ur(*) !nfft*nspinor*ndat)
706 
707 ! *************************************************************************
708 
709 #include "fftug_driver.finc"
710 
711 end subroutine fft_ug_spc

m_fft/fft_ur_dp [ Functions ]

[ Top ] [ m_fft ] [ Functions ]

NAME

 fft_ur_dp

FUNCTION

 Compute ndat zero-padded FFTs from R- to G-space .
 Mainly used for the transform of wavefunctions.
 TARGET: double precision real arrays with re/im

INPUTS

  See fft_ur_dpc

SOURCE

772 subroutine fft_ur_dp(npw_k, nfft, nspinor, ndat, mgfft, ngfft, istwf_k, kg_k, gbound_k, ur, ug)
773 
774 !Arguments ------------------------------------
775 !scalars
776  integer,intent(in) :: npw_k,nfft,nspinor,ndat,istwf_k,mgfft
777 !arrays
778  integer,intent(in) :: ngfft(18),gbound_k(2*mgfft+8,2), kg_k(3,npw_k)
779  real(dp),target,intent(inout) :: ur(*) !2*nfft*nspinor*ndat)
780  real(dp),target,intent(out) :: ug(*)   !2*npw_k*nspinor*ndat)
781 
782 !Local variables-------------------------------
783  complex(dp),contiguous,pointer :: ug_cplx(:), ur_cplx(:)
784 
785 ! *************************************************************************
786 
787  call C_F_pointer(c_loc(ug), ug_cplx, shape=[npw_k*nspinor*ndat])
788  call C_F_pointer(c_loc(ur), ur_cplx, shape=[nfft*nspinor*ndat])
789  call fft_ur_dpc(npw_k, nfft, nspinor, ndat, mgfft, ngfft, istwf_k, kg_k, gbound_k, ur_cplx, ug_cplx)
790 
791 end subroutine fft_ur_dp

m_fft/fft_ur_dpc [ Functions ]

[ Top ] [ m_fft ] [ Functions ]

NAME

 fft_ur_dpc

FUNCTION

 Compute ndat zero-padded FFTs from R- to G-space .
 Mainly used for the transform of wavefunctions.
 TARGET: dpc complex arrays

INPUTS

 npw_k=number of plane waves for this k-point.
 nfft=Number of FFT points.
 nspinor=number of spinorial components
 ndat=Number of wavefunctions to transform.
 mgfft=Max number of FFT divisions
 ngfft(18)=contain all needed information about 3D FFT, see ~abinit/doc/variables/vargs.htm#ngfft
 istwfk=Option describing the storage of the wavefunction. (at present must be 1)
 kg_k(3,npw_k)=G-vectors in reduced coordinates
 gbound_k(2*mgfft+8,2)=Table for padded-FFT. See sphereboundary.

SIDE EFFECTS

  ur(nfft*nspinor*ndat)=In input: wavefunctions in real space
                        Destroyed in output. Do not use ur anymore!

OUTPUT

  ug(npw_k*nspinor*ndat)=wavefunctions in reciprocal space given on the G-sphere.

SOURCE

873 subroutine fft_ur_dpc(npw_k, nfft, nspinor, ndat, mgfft, ngfft, istwf_k, kg_k, gbound_k, ur, ug)
874 
875 !Arguments ------------------------------------
876 !scalars
877  integer,intent(in) :: npw_k,nfft,nspinor,ndat,istwf_k,mgfft
878 !arrays
879  integer,intent(in) :: ngfft(18),gbound_k(2*mgfft+8,2),kg_k(3,npw_k)
880  complex(dpc),intent(inout) :: ur(*) ! nfft*nspinor*ndat)
881  complex(dpc),intent(out) :: ug(*)   ! npw_k*nspinor*ndat)
882 
883 ! *************************************************************************
884 
885 #include "fftur_driver.finc"
886 
887 end subroutine fft_ur_dpc

m_fft/fft_ur_spc [ Functions ]

[ Top ] [ m_fft ] [ Functions ]

NAME

 fft_ur_spc

FUNCTION

 Compute ndat zero-padded FFTs from R- to G-space .
 Mainly used for the transform of wavefunctions.
 TARGET: spc complex arrays

INPUTS

 npw_k=number of plane waves for this k-point.
 nfft=Number of FFT points.
 nspinor=number of spinorial components
 ndat=Number of wavefunctions to transform.
 mgfft=Max number of FFT divisions
 ngfft(18)=contain all needed information about 3D FFT, see ~abinit/doc/variables/vargs.htm#ngfft
 istwfk=Option describing the storage of the wavefunction. (at present must be 1)
 kg_k(3,npw_k)=G-vectors in reduced coordinates
 gbound_k(2*mgfft+8,2)=Table for padded-FFT. See sphereboundary.

SIDE EFFECTS

  ur(nfft*nspinor*ndat)=In input: wavefunctions in real space
                        Destroyed in output. Do not use ur anymore!

OUTPUT

  ug(npw_k*nspinor*ndat)=wavefunctions in reciprocal space given on the G-sphere.

SOURCE

825 subroutine fft_ur_spc(npw_k, nfft, nspinor, ndat, mgfft, ngfft, istwf_k, kg_k, gbound_k, ur, ug)
826 
827 !Arguments ------------------------------------
828 !scalars
829  integer,intent(in) :: npw_k,nfft,nspinor,ndat,istwf_k,mgfft
830 !arrays
831  integer,intent(in) :: ngfft(18),gbound_k(2*mgfft+8,2),kg_k(3,npw_k)
832  complex(spc),intent(inout) :: ur(*) !nfft*nspinor*ndat)
833  complex(spc),intent(out) :: ug(*)   !npw_k*nspinor*ndat)
834 
835 ! *************************************************************************
836 
837 #include "fftur_driver.finc"
838 
839 end subroutine fft_ur_spc

m_fft/fft_use_lib_threads [ Functions ]

[ Top ] [ m_fft ] [ Functions ]

NAME

 fft_use_lib_threads

FUNCTION

INPUTS

OUTPUT

SOURCE

1162 subroutine fft_use_lib_threads(logvar)
1163 
1164 !Arguments ------------------------------------
1165 !scalars
1166  logical,intent(in) :: logvar
1167 
1168 ! *************************************************************************
1169 
1170  call dfti_use_lib_threads(logvar)
1171  call fftw3_use_lib_threads(logvar)
1172 
1173 end subroutine fft_use_lib_threads

m_fft/fftbox_execute_ip_dpc [ Functions ]

[ Top ] [ m_fft ] [ Functions ]

NAME

  fftbox_execute_ip_dpc

FUNCTION

  In-place FFT transform of complex arrays
  Call (FFTW3|DFTI) routines if available, otherwise fallback to SG routines
  TARGET: dpc arrays

INPUTS

  plan<fftbox_plan3_t>=Structure with the parameters defining the transform.
  isign= Sign of the exponential in the FFT
  [iscale]= 0 if G --> R FFT should not be scaled. Default: 1 i.e. scale

SIDE EFFECTS

  ff(plan%ldxyz*plan%batch_size) =
    In input: the data to transform.
    Changed in output, filled with the FFT results.

SOURCE

460 subroutine fftbox_execute_ip_dpc(plan, ff, isign, ndat, iscale)
461 
462 !Arguments ------------------------------------
463 !scalars
464  class(fftbox_plan3_t),target,intent(inout) :: plan
465  integer,intent(in) :: isign
466  integer,optional,intent(in) :: ndat, iscale
467 !arrays
468  complex(dpc),target,intent(inout) :: ff(*)
469 
470 ! *************************************************************************
471 
472  integer :: ndat__, iscale__
473  ndat__ = plan%batch_size; if (present(ndat) ) ndat__ = ndat
474  ABI_DEFAULT(iscale__, iscale, 1)
475 
476 #if defined HAVE_GPU_CUDA
477  if (plan%gpu_option /= ABI_GPU_DISABLED) then
478    call xgpu_fftbox_c2c_ip(plan%dims, plan%embed, ndat__, isign, dpc, iscale__, c_loc(ff), &
479                            plan%gpu_plan_ip_dpc, plan%gpu_data_ip_dpc)
480    return
481  end if
482 #endif
483 
484  ! CPU version
485 #include "fftbox_ip_driver.finc"
486 
487 end subroutine fftbox_execute_ip_dpc

m_fft/fftbox_execute_ip_spc [ Functions ]

[ Top ] [ m_fft ] [ Functions ]

NAME

  fftbox_execute_ip_spc

FUNCTION

  In-place FFT transform of complex array.
  Call (FFTW3|DFTI) routines if available, otherwise fallback to SG routines
  TARGET: spc arrays

INPUTS

  plan<fftbox_plan3_t>=Structure with the parameters defining the transform.
  isign= Sign of the exponential in the FFT
  [iscale]= 0 if G --> R FFT should not be scaled. Default: 1 i.e. scale

SIDE EFFECTS

  ff(plan%ldxyz*plan%batch_size) =
    In input: the data to transform.
    Changed in output, filled with the FFT results.

SOURCE

407 subroutine fftbox_execute_ip_spc(plan, ff, isign, ndat, iscale)
408 
409 !Arguments ------------------------------------
410 !scalars
411  class(fftbox_plan3_t),target,intent(inout) :: plan
412  integer,intent(in) :: isign
413  integer,optional,intent(in) :: ndat, iscale
414 !arrays
415  complex(spc),target,intent(inout) :: ff(*)
416 
417 ! *************************************************************************
418 
419  integer :: ndat__, iscale__
420  ndat__ = plan%batch_size; if (present(ndat) ) ndat__ = ndat
421  ABI_DEFAULT(iscale__, iscale, 1)
422 
423 #if defined HAVE_GPU_CUDA
424  if (plan%gpu_option /= ABI_GPU_DISABLED) then
425    call xgpu_fftbox_c2c_ip(plan%dims, plan%embed, ndat__, isign, spc, iscale__, c_loc(ff), &
426                            plan%gpu_plan_ip_spc, plan%gpu_data_ip_spc)
427    return
428  end if
429 #endif
430 
431  ! CPU version
432 #include "fftbox_ip_driver.finc"
433 
434 end subroutine fftbox_execute_ip_spc

m_fft/fftbox_execute_op_dpc [ Functions ]

[ Top ] [ m_fft ] [ Functions ]

NAME

  fftbox_execute_op_dpc

FUNCTION

  Out-of-place FFT transform of complex arrays.
  Call (FFTW3|DFTI) routines if available, otherwise fallback to SG routines
  TARGET: dpc arrays

INPUTS

 plan<fftbox_plan3_t>=Structure with the parameters defining the transform.
 ff(plan%ldxyz*plan%batch_size)=The input array to be transformed.
 isign= Sign of the exponential in the FFT
 [iscale]= 0 if G --> R FFT should not be scaled. Default: 1 i.e. scale

OUTPUT

  gg(plan%ldxyz*plan%batch_size)= The FFT results.

SOURCE

565 subroutine fftbox_execute_op_dpc(plan, ff, gg, isign, ndat, iscale)
566 
567 !Arguments ------------------------------------
568 !scalars
569  class(fftbox_plan3_t),intent(inout) :: plan
570  integer,intent(in) :: isign
571  integer,optional,intent(in) :: ndat, iscale
572 !arrays
573  complex(dpc),target,intent(in) :: ff(*)
574  complex(dpc),target,intent(inout) :: gg(*)
575 
576 ! *************************************************************************
577 
578  integer :: ndat__, iscale__
579  ndat__ = plan%batch_size; if (present(ndat) ) ndat__ = ndat
580  ABI_DEFAULT(iscale__, iscale, 1)
581 
582 #if defined HAVE_GPU_CUDA
583  if (plan%gpu_option /= ABI_GPU_DISABLED) then
584    call xgpu_fftbox_c2c_op(plan%dims, plan%embed, ndat__, isign, dpc, iscale__, c_loc(ff), c_loc(gg), &
585                            plan%gpu_plan_op_dpc, plan%gpu_idata_op_dpc, plan%gpu_odata_op_dpc)
586    return
587  end if
588 #endif
589 
590  ! CPU version
591 #include "fftbox_op_driver.finc"
592 
593 end subroutine fftbox_execute_op_dpc

m_fft/fftbox_execute_op_spc [ Functions ]

[ Top ] [ m_fft ] [ Functions ]

NAME

  fftbox_execute_op_spc

FUNCTION

  Out-of-place FFT transform of complex arrays.
  Call (FFTW3|DFTI) routines if available, otherwise fallback to SG routines
  TARGET: spc arrays

INPUTS

 plan<fftbox_plan3_t>=Structure with the parameters defining the transform.
 ff(plan%ldxyz*plan%batch_size)=The input array to be transformed.
 isign= Sign of the exponential in the FFT
 [iscale]= 0 if G --> R FFT should not be scaled. Default: 1 i.e. scale

OUTPUT

  gg(plan%ldxyz*plan%batch_size)= The FFT results.

SOURCE

512 subroutine fftbox_execute_op_spc(plan, ff, gg, isign, ndat, iscale)
513 
514 !Arguments ------------------------------------
515 !scalars
516  class(fftbox_plan3_t),intent(inout) :: plan
517  integer,intent(in) :: isign
518  integer,optional,intent(in) :: ndat, iscale
519 !arrays
520  complex(spc),target,intent(in) :: ff(*)
521  complex(spc),target,intent(inout) :: gg(*)
522 
523 ! *************************************************************************
524 
525  integer :: ndat__, iscale__
526  ndat__ = plan%batch_size; if (present(ndat) ) ndat__ = ndat
527  ABI_DEFAULT(iscale__, iscale, 1)
528 
529 #if defined HAVE_GPU_CUDA
530  if (plan%gpu_option /= ABI_GPU_DISABLED) then
531    call xgpu_fftbox_c2c_op(plan%dims, plan%embed, ndat__, isign, spc, iscale__, c_loc(ff), c_loc(gg), &
532                            plan%gpu_plan_op_spc, plan%gpu_idata_op_spc, plan%gpu_odata_op_spc)
533    return
534  end if
535 #endif
536 
537  ! CPU version
538 #include "fftbox_op_driver.finc"
539 
540 end subroutine fftbox_execute_op_spc

m_fft/fftbox_mpi_utests [ Functions ]

[ Top ] [ m_fft ] [ Functions ]

NAME

  fftbox_mpi_utests

FUNCTION

 Driver routine for unit tests of the MPI FFT routines.

INPUTS

 fftalg =fftalg input variable.
 cplex=1 for r2c, 2 for c2c transforms.
 ndat = Number of transform to execute
 nthreads = Number of OpenMP threads.
 comm_fft=MPI communicator for the FFT
 [unit]=Output Unit number (DEFAULT std_out)

OUTPUT

  nfailed=number of failures.

SOURCE

1674 function fftbox_mpi_utests(fftalg, cplex, ndat, nthreads, comm_fft, unit) result(nfailed)
1675 
1676 !Arguments -----------------------------------
1677 !scalars
1678  integer,intent(in) :: fftalg,cplex,ndat,nthreads,comm_fft
1679  integer,optional,intent(in) :: unit
1680  integer :: nfailed
1681 
1682 !Local variables-------------------------------
1683 !scalars
1684  integer,parameter :: NSETS=6
1685  integer :: ierr,old_nthreads,ount,iset,mpierr,nfft,me_fft
1686  integer :: nproc_fft,fftalga,fftalgc,n1,n2,n3,n4,n5,n6
1687  real(dp),parameter :: ATOL_DP=tol12
1688  real(dp) :: max_abserr
1689  real(dp) ::  ctime,wtime,gflops
1690  character(len=500) :: msg,info,library,cplex_mode,padding_mode
1691  type(distribfft_type),target :: fftabs
1692 !arrays
1693  integer :: pars(6,NSETS),ngfft(18)
1694  integer, ABI_CONTIGUOUS pointer :: fftn2_distrib(:),ffti2_local(:)
1695  integer, ABI_CONTIGUOUS pointer :: fftn3_distrib(:),ffti3_local(:)
1696  real(dp),allocatable :: fofg(:,:),fofr(:),fofr_copy(:)
1697 
1698 ! *************************************************************************
1699 
1700  ount = std_out; if (PRESENT(unit)) ount = unit
1701  nfailed = 0
1702 
1703  if (nthreads>0) then
1704    old_nthreads = xomp_get_max_threads()
1705    call xomp_set_num_threads(nthreads)
1706  end if
1707 
1708  ! These values must be compatible with all the FFT routines.
1709  ! SG library is the most restrictive (only powers of 2,3,5).
1710  pars = RESHAPE( [    &
1711    12, 18, 15, 12, 18, 15, &
1712    12, 18, 15, 13, 19, 16, &
1713    12, 18, 15, 13, 19, 15, &
1714    12, 18, 15, 12, 18, 16, &
1715    12, 18, 15, 13, 18, 15, &
1716    12, 18, 15, 15, 21, 18  &
1717   ], [6, NSETS] )
1718 
1719  fftalga=fftalg/100; fftalgc=mod(fftalg,10)
1720 
1721  nproc_fft = xmpi_comm_size(comm_fft); me_fft = xmpi_comm_rank(comm_fft)
1722 
1723  call fftalg_info(fftalg,library,cplex_mode,padding_mode)
1724 
1725  !do iset=1,SIZE(pars,DIM=2)
1726  do iset=1,1
1727    n1=pars(1,iset); n2=pars(2,iset); n3=pars(3,iset)
1728    ! For the time being, ngfft(2) and ngfft(3) must be multiple of nproc_fft
1729    n2 = n2 * nproc_fft; n3 = n3 * nproc_fft
1730    !n4=pars(4,iset); n5=pars(5,iset); n6=pars(6,iset)
1731    n4=n1; n5=n2; n6=n3
1732 
1733    ! Init ngfft
1734    ! TODO Propagate more info via ngfft, define helper functions, write routine to get fftcache
1735    ngfft = 0
1736    ngfft(1:7) = [n1,n2,n3,n4,n5,n6,fftalg]
1737    ngfft(8)= get_cache_kb()        ! cache
1738    ngfft(9)=1                      ! paral_fft_
1739    ngfft(10)=nproc_fft             ! nproc_fft
1740    ngfft(11)=xmpi_comm_rank(comm_fft)  ! me_fft
1741    ngfft(12)=ngfft(2)/nproc_fft    ! n2proc
1742    ngfft(13)=ngfft(3)/nproc_fft    ! n3proc
1743 
1744    !call print_ngfft(ngfft,"ngfft for MPI-fourdp",unit=std_out,mode_paral="COLL",prtvol=0)
1745 
1746    ! Allocate arrays, fill fofr with random numbers and keep a copy.
1747    nfft = (n1 * n2 * n3) / nproc_fft
1748    ABI_MALLOC(fofg, (2,nfft*ndat))
1749    ABI_MALLOC(fofr, (cplex*nfft*ndat))
1750    ABI_MALLOC(fofr_copy, (cplex*nfft*ndat))
1751 
1752    call RANDOM_NUMBER(fofr)
1753    fofr_copy = fofr
1754 
1755    call init_distribfft(fftabs,"c",nproc_fft,n2,n3)
1756    fftn2_distrib => fftabs%tab_fftdp2_distrib
1757    ffti2_local => fftabs%tab_fftdp2_local
1758    fftn3_distrib => fftabs%tab_fftdp3_distrib
1759    ffti3_local => fftabs%tab_fftdp3_local
1760 
1761    call cwtime(ctime,wtime,gflops,"start")
1762 
1763    select case (fftalga)
1764 
1765    case (FFT_SG2002)
1766      call sg2002_mpifourdp(cplex,nfft,ngfft,ndat,-1,fftn2_distrib,ffti2_local,fftn3_distrib,ffti3_local,fofg,fofr,comm_fft)
1767      call sg2002_mpifourdp(cplex,nfft,ngfft,ndat,+1,fftn2_distrib,ffti2_local,fftn3_distrib,ffti3_local,fofg,fofr,comm_fft)
1768 
1769    case (FFT_FFTW3)
1770      call fftw3_mpifourdp(cplex,nfft,ngfft,ndat,-1,fftn2_distrib,ffti2_local,fftn3_distrib,ffti3_local,fofg,fofr,comm_fft)
1771      call fftw3_mpifourdp(cplex,nfft,ngfft,ndat,+1,fftn2_distrib,ffti2_local,fftn3_distrib,ffti3_local,fofg,fofr,comm_fft)
1772 
1773    !case (FFT_DFTI)
1774    !  call dfti_mpifourdp(cplex,nfft,ngfft,ndat,-1,fftn2_distrib,ffti2_local,fftn3_distrib,ffti3_local,fofg,fofr,comm_fft)
1775    !  call dfti_mpifourdp(cplex,nfft,ngfft,ndat,+1,fftn2_distrib,ffti2_local,fftn3_distrib,ffti3_local,fofg,fofr,comm_fft)
1776 
1777    case default
1778      ABI_BUG(sjoin("fftalg: ", itoa(fftalg), " does not support MPI-FFT"))
1779    end select
1780 
1781    call cwtime(ctime,wtime,gflops,"stop")
1782    if (me_fft == 0) then
1783      !write(std_out,'(a,i0,2(a,f10.4))')"fftalg: ",fftalg,", cpu_time: ",ctime,", wall_time: ",wtime
1784    end if
1785 
1786    ! Check if F^{-1} F = I within ATOL_DP
1787    ierr = COUNT(ABS(fofr - fofr_copy) > ATOL_DP)
1788    call xmpi_sum(ierr,comm_fft,mpierr)
1789    nfailed = nfailed + ierr
1790 
1791    if (cplex == 1) info = sjoin(library,"r2c --> c2r :")
1792    if (cplex == 2) info = sjoin(library,"c2c :")
1793 
1794    if (ierr /= 0) then
1795      ! Compute the maximum of the absolute error.
1796      max_abserr = MAXVAL(ABS(fofr - fofr_copy))
1797      call xmpi_max(max_abserr,comm_fft,mpierr)
1798      write(msg,"(a,es9.2,a)")" FAILED (max_abserr = ",max_abserr,")"
1799    else
1800      write(msg,"(a)")" OK"
1801    end if
1802    call wrtout(ount,sjoin(info, msg))
1803 
1804    call destroy_distribfft(fftabs)
1805 
1806    ABI_FREE(fofg)
1807    ABI_FREE(fofr_copy)
1808    ABI_FREE(fofr)
1809  end do
1810 
1811  if (nthreads > 0) call xomp_set_num_threads(old_nthreads)
1812 
1813 end function fftbox_mpi_utests

m_fft/fftbox_plan3_free [ Functions ]

[ Top ] [ m_fft ] [ Functions ]

NAME

  fftbox_plan3_free

FUNCTION

  Free dynamic memory.

SOURCE

359 subroutine fftbox_plan3_free(plan)
360 
361 !Arguments ------------------------------------
362  class(fftbox_plan3_t),target,intent(inout) :: plan
363 
364 ! *************************************************************************
365 
366  ABI_UNUSED(plan%ldxyz)
367 
368 #if defined HAVE_GPU_CUDA
369  call gpu_planpp_free(plan%gpu_plan_ip_spc)
370  call devpp_free(plan%gpu_data_ip_spc)
371  call gpu_planpp_free(plan%gpu_plan_ip_dpc)
372  call devpp_free(plan%gpu_data_ip_dpc)
373  call gpu_planpp_free(plan%gpu_plan_op_spc)
374  call devpp_free(plan%gpu_idata_op_spc)
375  call devpp_free(plan%gpu_odata_op_spc)
376  call gpu_planpp_free(plan%gpu_plan_op_dpc)
377  call devpp_free(plan%gpu_idata_op_dpc)
378  call devpp_free(plan%gpu_odata_op_dpc)
379 #endif
380 
381 end subroutine fftbox_plan3_free

m_fft/fftbox_plan3_from_ngfft [ Functions ]

[ Top ] [ m_fft ] [ Functions ]

NAME

  fftbox_plan3_from_ngfft

FUNCTION

  Initialize plan from ngfft.

SOURCE

335 subroutine fftbox_plan3_from_ngfft(plan, ngfft, batch_size, gpu_option)
336 
337 !Arguments ------------------------------------
338  class(fftbox_plan3_t),intent(out) :: plan
339  integer,intent(in) :: ngfft(18), batch_size, gpu_option
340 
341 ! *************************************************************************
342 
343  call plan%init(batch_size, ngfft(1:3), ngfft(4:6), ngfft(7), ngfft(8), gpu_option)
344 
345 end subroutine fftbox_plan3_from_ngfft

m_fft/fftbox_plan3_init [ Functions ]

[ Top ] [ m_fft ] [ Functions ]

NAME

  fftbox_plan3_init

FUNCTION

  Initialize the plan with the options passed to the fttbox_ routines.
  Low-level constructor.

INPUTS

  See fftbox_plan3_t for the meaning of the different arguments.

SOURCE

301 subroutine fftbox_plan3_init(plan, batch_size, dims, embed, fftalg, fftcache, gpu_option)
302 
303 !Arguments ------------------------------------
304 !scalars
305  class(fftbox_plan3_t),intent(out) :: plan
306  integer,intent(in) :: batch_size, fftalg, fftcache, gpu_option
307 !arrays
308  integer,intent(in) :: dims(3), embed(3)
309 
310 ! *************************************************************************
311 
312  plan%batch_size = batch_size
313  plan%dims     = dims                       ! ngfft(1:3)
314  plan%embed    = embed                      ! ngfft(4:6)
315  plan%fftalg   = fftalg                     ! ngfft(7)
316  if (fftcache > 0) plan%fftcache = fftcache ! ngfft(8)
317  plan%gpu_option  = gpu_option
318  plan%nfft  = product(plan%dims)
319  plan%ldxyz = product(plan%embed)
320 
321 end subroutine fftbox_plan3_init

m_fft/fftbox_plan3_t [ Types ]

[ Top ] [ m_fft ] [ Types ]

NAME

 fftbox_plan3_t

FUNCTION

  Options passed to the fftbox_* routines to perform dense 3d FFTs.

SOURCE

125  type,public :: fftbox_plan3_t
126 
127    integer :: fftalg = 112       ! The library to call.
128    integer :: fftcache = 16      ! Cache size in kB. Only used in SG routines.
129    integer :: nfft = -1          ! Total number of points in the FFT box.
130    integer :: ldxyz = -1         ! Physical dimension of the array to transform
131    integer :: batch_size = -1    ! MAXIMUM number of FFTs associated to the plan.
132    integer :: dims(3) = -1       ! The number of FFT divisions.
133    integer :: embed(3) = -1      ! Leading dimensions of the input, output arrays.
134    integer :: gpu_option = ABI_GPU_DISABLED  ! /= 0 if FFTs should be offloaded to the GPU
135 
136    type(c_ptr) :: gpu_plan_ip_spc = c_null_ptr
137    type(c_ptr) :: gpu_data_ip_spc = c_null_ptr
138    type(c_ptr) :: gpu_plan_ip_dpc = c_null_ptr
139    type(c_ptr) :: gpu_data_ip_dpc = c_null_ptr
140    type(c_ptr) :: gpu_plan_op_spc = c_null_ptr
141    type(c_ptr) :: gpu_idata_op_spc = c_null_ptr
142    type(c_ptr) :: gpu_odata_op_spc = c_null_ptr
143    type(c_ptr) :: gpu_plan_op_dpc = c_null_ptr
144    type(c_ptr) :: gpu_idata_op_dpc = c_null_ptr
145    type(c_ptr) :: gpu_odata_op_dpc = c_null_ptr
146 
147  contains
148 
149    procedure :: init => fftbox_plan3_init                 ! Low-level constructor
150    procedure :: from_ngfft => fftbox_plan3_from_ngfft     ! Build object from ngfft.
151    procedure :: execute_ip_spc => fftbox_execute_ip_spc
152    procedure :: execute_ip_dpc => fftbox_execute_ip_dpc
153    procedure :: execute_op_spc => fftbox_execute_op_spc
154    procedure :: execute_op_dpc => fftbox_execute_op_dpc
155 
156    ! Main entry point for performing FFTs on the full box
157    ! complex-to-complex version, operating on complex arrays
158    generic :: execute => execute_ip_spc, &
159                          execute_ip_dpc, &
160                          execute_op_spc, &
161                          execute_op_dpc
162 
163    procedure :: free => fftbox_plan3_free
164    ! Free dynamic memory
165 
166  end type fftbox_plan3_t

m_fft/fftbox_utests [ Functions ]

[ Top ] [ m_fft ] [ Functions ]

NAME

  fftbox_utests

FUNCTION

 Driver routine for base unitary tests of the FFT routines (sequential version).

INPUTS

 fftalg =fftalg input variable.
 ndat = Number of transform to execute
 nthreads = Number of OpenMP threads.
 gpu_option=  GPU version to active (0: no GPU).
 [unit]=Output Unit number (DEFAULT std_out)

OUTPUT

  nfailed=number of failures.

SOURCE

1197 integer function fftbox_utests(fftalg, ndat, nthreads, gpu_option, unit) result(nfailed)
1198 
1199 !Arguments -----------------------------------
1200 !scalars
1201  integer,intent(in) :: fftalg, ndat, nthreads, gpu_option
1202  integer,optional,intent(in) :: unit
1203 
1204 !Local variables-------------------------------
1205 !scalars
1206  integer,parameter :: NSETS=6, fftcache0 = 0
1207  integer :: ifft,ierr,ldxyz,old_nthreads,ount,cplex
1208  integer :: iset,nx,ny,nz,ldx,ldy,ldz,fftalga,fftalgc
1209  !integer :: ix,iy,iz,padat,dat
1210  real(dp),parameter :: ATOL_SP=tol6,ATOL_DP=tol12 ! Tolerances on the absolute error
1211  real(dp) :: max_abserr
1212  character(len=500) :: msg,info,library,cplex_mode,padding_mode
1213  type(fftbox_plan3_t) :: box_plan
1214 !arrays
1215  integer :: pars(6,NSETS)
1216  real(dp) :: crand(2)
1217  real(dp),allocatable :: fofg(:),fofr_ref(:),fofr(:)
1218  complex(dpc),allocatable :: ff(:),ff_ref(:),gg(:)
1219  complex(spc),allocatable :: ffsp(:),ff_refsp(:),ggsp(:)
1220 
1221 ! *************************************************************************
1222 
1223  nfailed = 0
1224  ount = std_out; if (PRESENT(unit)) ount = unit
1225 
1226  if (nthreads > 0) then
1227    old_nthreads = xomp_get_max_threads()
1228    call xomp_set_num_threads(nthreads)
1229  end if
1230 
1231  ! These values must be compatible with all the FFT routines.
1232  ! SG library is the most restrictive (only powers of 2,3,5).
1233  pars = RESHAPE( [   &
1234    12, 18, 15, 12, 18, 15, &
1235    12, 18, 15, 13, 19, 16, &
1236    12, 18, 15, 13, 19, 15, &
1237    12, 18, 15, 12, 18, 16, &
1238    12, 18, 15, 13, 18, 15, &
1239    12, 18, 15, 15, 21, 18  &
1240  ], [6, NSETS])
1241 
1242  fftalga=fftalg/100; fftalgc=mod(fftalg,10)
1243 
1244  call fftalg_info(fftalg, library, cplex_mode, padding_mode)
1245 
1246  do iset=1,SIZE(pars,DIM=2)
1247    nx =pars(1,iset);  ny=pars(2,iset);  nz=pars(3,iset)
1248    ldx=pars(4,iset); ldy=pars(5,iset); ldz=pars(6,iset)
1249 
1250    ! Create the FFT plan
1251    call box_plan%init(ndat, pars(1,iset), pars(4,iset), fftalg, fftcache0, gpu_option)
1252 
1253    ldxyz = ldx*ldy*ldz
1254    !
1255    ! ======================================
1256    ! === TEST the single precision version
1257    ! ======================================
1258    ABI_MALLOC(ff_refsp, (ldxyz*ndat))
1259    ABI_MALLOC(ffsp,     (ldxyz*ndat))
1260    ABI_MALLOC(ggsp,     (ldxyz*ndat))
1261 
1262    do ifft=1,ldxyz*ndat
1263      call RANDOM_NUMBER(crand)
1264      ff_refsp(ifft) = DCMPLX(crand(1), crand(2))
1265    end do
1266 
1267    ! Set the augmentation region to zero to avoid SIGFPE, as FFTW3 wrappers use zscal to scale the results.
1268    call cplx_setaug_zero_spc(nx,ny,nz,ldx,ldy,ldz,ndat,ff_refsp)
1269    ffsp = ff_refsp
1270 
1271    ! in-place version.
1272    call box_plan%execute(ffsp, +1)
1273    call box_plan%execute(ffsp, -1)
1274 
1275    ! do it twice to test GPU version
1276    !call box_plan%execute(ffsp, +1)
1277    !call box_plan%execute(ffsp, -1)
1278 
1279    ierr = COUNT(ABS(ffsp - ff_refsp) > ATOL_SP)
1280    nfailed = nfailed + ierr
1281 
1282    info = sjoin(library, "c2c_ip_spc :")
1283    if (ierr /= 0) then
1284      max_abserr = MAXVAL(ABS(ffsp - ff_refsp))
1285      write(msg,"(a,es9.2,a)")" FAILED (max_abserr = ",max_abserr,")"
1286    else
1287      write(msg,"(a)")" OK"
1288    end if
1289    call wrtout(ount,sjoin(info,msg))
1290 
1291    ! out-of-place version.
1292    ffsp = ff_refsp
1293    call box_plan%execute(ffsp, ggsp, +1)
1294    call box_plan%execute(ggsp, ffsp, -1)
1295 
1296    ierr = COUNT(ABS(ffsp - ff_refsp) > ATOL_SP)
1297    nfailed = nfailed + ierr
1298 
1299    info = sjoin(library, "c2c_op_spc :")
1300    if (ierr /= 0) then
1301      max_abserr = MAXVAL(ABS(ffsp - ff_refsp))
1302      write(msg,"(a,es9.2,a)")" FAILED (max_abserr = ",max_abserr,")"
1303    else
1304      write(msg,"(a)")" OK"
1305    end if
1306    call wrtout(ount, sjoin(info, msg))
1307 
1308    ABI_FREE(ff_refsp)
1309    ABI_FREE(ffsp)
1310    ABI_FREE(ggsp)
1311 
1312    ! =======================================
1313    ! === TEST the double precision version
1314    ! =======================================
1315    ABI_MALLOC(ff_ref, (ldxyz*ndat))
1316    ABI_MALLOC(ff,     (ldxyz*ndat))
1317    ABI_MALLOC(gg,     (ldxyz*ndat))
1318 
1319    do ifft=1,ldxyz*ndat
1320      call RANDOM_NUMBER(crand)
1321      ff_ref(ifft) = DCMPLX(crand(1), crand(2))
1322    end do
1323 
1324    ! Set the augmentation region to zero to avoid SIGFPE, as FFTW3 wrappers use zscal to scale the results.
1325    call cplx_setaug_zero_dpc(nx,ny,nz,ldx,ldy,ldz,ndat,ff_ref)
1326    ff = ff_ref
1327 
1328    ! in-place version.
1329    call box_plan%execute(ff, +1)
1330    call box_plan%execute(ff, -1)
1331 
1332    ierr = COUNT(ABS(ff - ff_ref) > ATOL_DP)
1333    nfailed = nfailed + ierr
1334 
1335    info = sjoin(library, "c2c_ip_dpc :")
1336    if (ierr /= 0) then
1337      max_abserr = MAXVAL(ABS(ff - ff_ref))
1338      write(msg,"(a,es9.2,a)")" FAILED (max_abserr = ",max_abserr,")"
1339    else
1340      write(msg,"(a)")" OK"
1341    end if
1342    call wrtout(ount,sjoin(info, msg))
1343 
1344    ! out-of-place version.
1345    ff = ff_ref
1346    call box_plan%execute(ff, gg, +1)
1347    call box_plan%execute(gg, ff, -1)
1348 
1349    ierr = COUNT(ABS(ff - ff_ref) > ATOL_DP)
1350    nfailed = nfailed + ierr
1351 
1352    info = sjoin(library, "c2c_op_dpc :")
1353    if (ierr /= 0) then
1354      max_abserr = MAXVAL(ABS(ff - ff_ref))
1355      write(msg,"(a,es9.2,a)")" FAILED (max_abserr = ",max_abserr,")"
1356    else
1357      write(msg,"(a)")" OK"
1358    end if
1359    call wrtout(ount, sjoin(info, msg))
1360 
1361    ABI_FREE(ff_ref)
1362    ABI_FREE(ff)
1363    ABI_FREE(gg)
1364 
1365    call box_plan%free()
1366    !stop
1367 
1368    do cplex=1,2
1369      !if (fftalga == FFT_FFTW3 .and. ndat > 1 .and. cplex==1) then
1370      !  call wrtout(ount,"Warning: fourdp with FFTW3-wrappers, cplex=2 and ndat>1, might crash if MKL is used")
1371      !  !CYCLE
1372      !end if
1373      ABI_MALLOC(fofg,     (2*ldxyz*ndat))
1374      ABI_MALLOC(fofr_ref, (cplex*ldxyz*ndat))
1375      ABI_MALLOC(fofr,     (cplex*ldxyz*ndat))
1376 
1377      call RANDOM_NUMBER(fofr_ref)
1378      !call cg_setaug_zero(cplex,nx,ny,nz,ldx,ldy,ldz,ndat,fofr_ref)
1379      fofr = fofr_ref
1380 
1381      select case (fftalga)
1382      case (FFT_FFTW3)
1383        call fftw3_seqfourdp(cplex,nx,ny,nz,ldx,ldy,ldz,ndat,-1,fofg,fofr)
1384        call fftw3_seqfourdp(cplex,nx,ny,nz,ldx,ldy,ldz,ndat,+1,fofg,fofr)
1385 
1386      case (FFT_DFTI)
1387        call dfti_seqfourdp(cplex,nx,ny,nz,ldx,ldy,ldz,ndat,-1,fofg,fofr)
1388        call dfti_seqfourdp(cplex,nx,ny,nz,ldx,ldy,ldz,ndat,+1,fofg,fofr)
1389 
1390      case default
1391        ! TODO
1392        continue
1393      end select
1394 
1395      !call cg_setaug_zero(cplex,nx,ny,nz,ldx,ldy,ldz,ndat,fofr)
1396 
1397      ierr = COUNT(ABS(fofr - fofr_ref) > ATOL_DP)
1398      nfailed = nfailed + ierr
1399 
1400      write(info,"(a,i1,a)")sjoin(library, "fourdp (cplex "),cplex,") :"
1401      !write(info,"(2a,i1,a,i0,a)")trim(library), "fourdp (cplex ", cplex,"), ndata = ",ndat," :"
1402      if (ierr /= 0) then
1403        max_abserr = MAXVAL(ABS(fofr - fofr_ref))
1404        write(msg,"(a,es9.2,a)")" FAILED (max_abserr = ",max_abserr,")"
1405      else
1406        write(msg,"(a)")" OK"
1407      end if
1408      call wrtout(ount,sjoin(info, msg))
1409 
1410      ABI_FREE(fofg)
1411      ABI_FREE(fofr_ref)
1412      ABI_FREE(fofr)
1413    end do
1414  end do
1415 
1416  if (nthreads > 0) call xomp_set_num_threads(old_nthreads)
1417 
1418 end function fftbox_utests

m_fft/fftmpi_u [ Functions ]

[ Top ] [ m_fft ] [ Functions ]

NAME

 fftmpi_u

FUNCTION

  **** DO NOT USE IT ****
  Wrapper function used to perform the MPI FFT of ndat wavefunctions.
  Mainly used for unit tests and prototyping. Better integration
  will be provided afterwards.

INPUTS

  See fourwf_mpi

OUTPUT

  See fourwf_mpi

SOURCE

4053 ! The final interface should be:
4054 !subroutine fftmpi_u(npw_k,n4,n5,n6,nspinor,ndat,mgfft,ngfft,istwf_k,kg_k,gbound_k,fftabs,isign,fofg,fofr)
4055 
4056 subroutine fftmpi_u(npw_k,n4,n5,n6,ndat,mgfft,ngfft,&
4057                     istwf_k,gbound_k,kg_k,me_g0,distribfft,isign,fofg,fofr,comm_fft,cplexwf)
4058 
4059 !Arguments ------------------------------------
4060 !scalars
4061  integer,intent(in) :: istwf_k,mgfft,n4,n5,n6,ndat,npw_k
4062  integer,intent(in) :: isign,comm_fft,me_g0,cplexwf
4063  type(distribfft_type),intent(in) :: distribfft
4064 !arrays
4065  integer,intent(in) :: gbound_k(2*mgfft+8,2),ngfft(18)
4066  integer,intent(in) :: kg_k(3,npw_k)
4067  real(dp),intent(inout) :: fofg(2,npw_k*ndat),fofr(2,n4,n5,n6*ndat)
4068 
4069 !Local variables-------------------------------
4070 !scalars
4071  integer,parameter :: npw0=0,cplex0=0
4072  integer :: n1,n2,n3
4073  real(dp),parameter :: weight_r=one,weight_i=one
4074 !arrays
4075  integer :: dummy_kg(0,0)
4076  real(dp) :: dummy_denpot(0,0,0),dummy_fofg(0,0)
4077 
4078 ! *************************************************************************
4079 
4080  n1 = ngfft(1); n2 = ngfft(2); n3 = ngfft(3)
4081 
4082  if (isign == 1) then
4083    ! option 0 G --> R
4084    call fourwf_mpi(cplex0,dummy_denpot,fofg,dummy_fofg,fofr,&
4085      gbound_k,gbound_k,istwf_k,kg_k,dummy_kg,me_g0,mgfft,ngfft,distribfft,n1,n2,n3,&
4086      npw_k,npw0,n4,n5,n6,ndat,0,weight_r,weight_i,comm_fft,cplexwf=cplexwf)
4087  else
4088    ! option 3 R --> G
4089    call fourwf_mpi(cplex0,dummy_denpot,dummy_fofg,fofg,fofr,&
4090      gbound_k,gbound_k,istwf_k,dummy_kg,kg_k,me_g0,mgfft,ngfft,distribfft,n1,n2,n3,&
4091      npw0,npw_k,n4,n5,n6,ndat,3,weight_r,weight_i,comm_fft,cplexwf=cplexwf)
4092  end if
4093 
4094 end subroutine fftmpi_u

m_fft/fftpac [ Functions ]

[ Top ] [ m_fft ] [ Functions ]

NAME

 fftpac

FUNCTION

 Allow for data copying to modify the stride (dimensioning) of a three-dimensional array,
 for more efficient three dimensional fft.
 NOTE that the arrays are in REAL space.

 Note that arrays aa and bb may be the same array (start at the same address).
 The array aa also incorporate a spin variable.
 MG FIXME: THIS IS **VERY BAD** AS FORTRAN DOES NOT ALLOW FOR ALIASING

INPUTS

  ispden=actual spin-density of interest
  nspden=number of spin-density components
  n1,n2,n3=actual data dimensions, dimensions of complex array a
  nd1,nd2,nd3=array dimensions of (larger) array b
  ngfft(18)=contain all needed information about 3D FFT, see ~abinit/doc/variables/vargs.htm#ngfft
  option= see description of side effects

OUTPUT

  (see side effects)

SIDE EFFECTS

  aa & bb arrays are treated as input or output depending on option:
  option=1  aa(n1*n2*n3,ispden) <-- bb(nd1,nd2,nd3) real case
  option=2  aa(n1*n2*n3,ispden) --> bb(nd1,nd2,nd3) real case
  option=10 aa(n1*n2*n3,ispden) <-- bb(nd1,nd2,nd3) complex case like option 1 real part
  option=11 aa(n1*n2*n3,ispden) <-- bb(nd1,nd2,nd3) complex case like option 1 imag part

SOURCE

4418 subroutine fftpac(ispden,mpi_enreg,nspden,n1,n2,n3,nd1,nd2,nd3,ngfft,aa,bb,option)
4419 
4420 !Arguments ------------------------------------
4421 !scalars
4422  integer,intent(in) :: ispden,n1,n2,n3,nd1,nd2,nd3,nspden,option
4423  type(mpi_type),intent(in) :: mpi_enreg
4424 !arrays
4425  integer,intent(in) :: ngfft(18)
4426  real(dp),intent(inout) :: aa(n1*n2*n3/ngfft(10),nspden),bb(nd1,nd2,nd3)
4427 
4428 !Local variables-------------------------------
4429 !scalars
4430  integer :: i1,i2,i3,index,me_fft,nproc_fft
4431  character(len=500) :: msg
4432  !arrays
4433  integer, ABI_CONTIGUOUS pointer :: fftn2_distrib(:),ffti2_local(:)
4434  integer, ABI_CONTIGUOUS pointer :: fftn3_distrib(:),ffti3_local(:)
4435 
4436 ! *************************************************************************
4437 
4438  me_fft=ngfft(11); nproc_fft=ngfft(10)
4439 
4440  if (option==1.or.option==2) then
4441    if (nd1<n1.or.nd2<n2.or.nd3<n3) then
4442      write(msg,'(a,3i0,2a,3i0,a)')&
4443       'Each of nd1,nd2,nd3=',nd1,nd2,nd3,ch10,&
4444       'must be >= n1, n2, n3 =',n1,n2,n3,'.'
4445      ABI_BUG(msg)
4446    end if
4447  else
4448    if (2*nd1<n1.or.nd2<n2.or.nd3<n3) then
4449      write(msg,'(a,3i0,2a,3i0,a)')&
4450      'Each of 2*nd1,nd2,nd3=',2*nd1,nd2,nd3,ch10,&
4451      'must be >= (n1, n2, n3) =',n1,n2,n3,'.'
4452      ABI_BUG(msg)
4453    end if
4454  end if
4455 
4456  ! Get the distrib associated with this fft_grid
4457  call ptabs_fourdp(mpi_enreg,n2,n3,fftn2_distrib,ffti2_local,fftn3_distrib,ffti3_local)
4458 
4459  if (option==1) then
4460    ! aa(n1*n2*n3,ispden) <-- bb(nd1,nd2,nd3) real case
4461    do i3=1,n3
4462      if (me_fft==fftn3_distrib(i3)) then
4463        do i2=1,n2
4464          do i1=1,n1
4465            aa(i1+n1*(i2-1+n2*(ffti3_local(i3)-1)),ispden)=bb(i1,i2,i3)
4466          end do
4467        end do
4468      end if
4469    end do
4470 
4471  else if (option==2) then
4472    !  option=2  aa(n1*n2*n3,ispden) --> bb(nd1,nd2,nd3) real case
4473    !  Here we avoid corrupting the data in a while writing to b in the
4474    !  case in which a and b are same array.
4475    !  Also: replace "trash" data with 0 s to avoid floating point
4476    !  exceptions when this data is actually manipulated in fft.
4477    do i3=nd3,n3+1,-1
4478      do i2=nd2,1,-1
4479        do i1=nd1,1,-1
4480          bb(i1,i2,i3)=0.d0
4481        end do
4482      end do
4483    end do
4484    do i3=n3,1,-1
4485      if (me_fft==fftn3_distrib(i3)) then
4486        do i2=nd2,n2+1,-1
4487          do i1=nd1,1,-1
4488            bb(i1,i2,i3)=0.d0
4489          end do
4490        end do
4491        do i2=n2,1,-1
4492          do i1=nd1,n1+1,-1
4493            bb(i1,i2,i3)=0.d0
4494          end do
4495          do i1=n1,1,-1
4496            bb(i1,i2,i3)=aa(i1+n1*(i2-1+n2*(ffti3_local(i3) - 1)),ispden)
4497          end do
4498        end do
4499      end if
4500    end do
4501 !  MF
4502  else if (option==10 .or. option==11) then
4503    ! option=10 aa(n1*n2*n3,ispden) <-- bb(nd1,nd2,nd3) complex case like option 1 real part
4504    ! option=11 aa(n1*n2*n3,ispden) <-- bb(nd1,nd2,nd3) complex case like option 1 imag part
4505    index=1
4506    if(option==11) index=2
4507    do i3=1,n3
4508      do i2=1,n2
4509        do i1=1,n1/2
4510          aa(index,ispden)=bb(i1,i2,i3)
4511          index=index+2
4512        end do
4513      end do
4514    end do
4515 !  MF
4516  else
4517    write(msg,'(a,i0,a)')' Bad option =',option,'.'
4518    ABI_BUG(msg)
4519  end if
4520 
4521 end subroutine fftpac

m_fft/fftpad_dpc [ Functions ]

[ Top ] [ m_fft ] [ Functions ]

NAME

  fftpad_dpc

FUNCTION

  Driver routine used to transform COMPLEX arrays using 3D zero-padded FFTs.
  TARGET: DPC arrays

INPUTS

  ngfft(18)=Info on the 3D FFT.
  nx,ny,nz=Logical dimensions of the FFT mesh.
  ldx,ldy,ldz=Physical dimension of the f array (to avoid cache conflicts).
  ndat=Number of FFTs
  mgfft=MAX(nx,ny,nz), only used to dimension gbound
  isign=The sign of the transform.
  gbound(2*mgfft+8,2)= The boundaries of the basis sphere of G vectors at a given k-point. See sphereboundary for more info.

SIDE EFFECTS

  ff(ldx*ldy*ldz*ndat)=
    input: The array with the data to be transformed.
    output: The results of the FFT.

SOURCE

1008 subroutine fftpad_dpc(ff, ngfft, nx, ny, nz, ldx, ldy, ldz, ndat, mgfft, isign, gbound)
1009 
1010 !Arguments ------------------------------------
1011 !scalars
1012  integer,intent(in) :: nx,ny,nz,ldx,ldy,ldz,ndat,mgfft,isign
1013 !arrays
1014  integer,intent(in) :: ngfft(18),gbound(2*mgfft+8,2)
1015  complex(dpc),target,intent(inout) :: ff(ldx*ldy*ldz*ndat)
1016 
1017 !Local variables-------------------------------
1018 !scalars
1019  integer :: fftalg,fftalga,fftalgc,ncount
1020  integer :: ivz !vz_d
1021  character(len=500) :: msg
1022 !arrays
1023  real(dp),allocatable :: fofr(:,:,:,:,:)
1024  real(dp),allocatable :: fofrvz(:,:) !vz_d
1025  real(dp),ABI_CONTIGUOUS pointer :: fpt_ftarr(:,:,:,:,:)
1026 
1027 ! *************************************************************************
1028 
1029  fftalg=ngfft(7); fftalga=fftalg/100; fftalgc=MOD(fftalg,10)
1030 
1031  select case (fftalga)
1032 
1033  case (FFT_FFTW3)
1034    call fftw3_fftpad(ff, nx, ny, nz, ldx, ldy, ldz, ndat, mgfft, isign, gbound)
1035 
1036  case (FFT_DFTI)
1037    call dfti_fftpad(ff, nx, ny, nz, ldx, ldy, ldz, ndat, mgfft, isign, gbound)
1038 
1039  case (FFT_SG)
1040    ! Goedecker"s routines.
1041    ! TODO: sg_fftpad is not the fastest routine, here I should call sg_fftrisc but I need
1042    ! kg_kin that is not available in rho_tw_g, actually one should pass G-G0 due to
1043    ! the shift introduced by the umklapp.
1044    ncount = ldx*ldy*ldz*ndat
1045 
1046    ABI_MALLOC(fofr, (2,ldx,ldy,ldz,ndat))
1047 !  call ZCOPY(ncount,ff,1,fofr,1) !vz_d
1048 !  call DCOPY(2*ncount,ff,1,fofr,1)  ! MG
1049    ! alternatif of ZCOPY from vz
1050    ABI_MALLOC(fofrvz,(2,ncount))     !vz_d
1051    do ivz=1,ncount                !vz_d
1052      fofrvz(1,ivz)= real(ff(ivz))  !vz_d
1053      fofrvz(2,ivz)=aimag(ff(ivz))  !vz_d
1054    end do                         !vz_d
1055    call DCOPY(2*ncount,fofrvz,1,fofr,1) !vz_d
1056    ABI_FREE(fofrvz)             !vz_d
1057 
1058    call C_F_pointer(C_loc(ff),fpt_ftarr, shape=(/2,ldx,ldy,ldz,ndat/))
1059 
1060    !  MT nov. 2012: with xlf, need to put explicit boundaries for the 4th dimension
1061    !  this looks like a compiler bug...
1062    !if (size(fpt_ftarr)==2*size(ff)) then
1063    call sg_fftpad(ngfft(8),mgfft,nx,ny,nz,ldx,ldy,ldz,ndat,gbound,isign,fofr,fpt_ftarr)
1064    !else
1065    !  call sg_fftpad(ngfft(8),mgfft,nx,ny,nz,ldx,ldy,ldz,ndat,gbound,isign,fofr,fpt_ftarr(:,:,:,1:ldz,dat))
1066    !end if
1067 
1068    ABI_FREE(fofr)
1069 
1070    if (isign==-1) then  ! Here there might be numerical exceptions due to the holes.
1071      call xscal(ncount,one/(nx*ny*nz),ff,1)
1072    end if
1073 
1074  case default
1075    write(msg,'(a,i0,a)')"fftalga = ", fftalga," not coded "
1076    ABI_ERROR(msg)
1077  end select
1078 
1079 end subroutine fftpad_dpc

m_fft/fftpad_spc [ Functions ]

[ Top ] [ m_fft ] [ Functions ]

NAME

  fftpad_spc

FUNCTION

  Driver routine used to transform COMPLEX arrays using 3D zero-padded FFTs.
  TARGET: single-precision complex.

INPUTS

  ngfft(18)=Info on the 3D FFT.
  nx,ny,nz=Logical dimensions of the FFT mesh.
  ldx,ldy,ldz=Physical dimension of the f array (to avoid cache conflicts).
  ndat=Number of FFTs
  mgfft=MAX(nx,ny,nz), only used to dimension gbound
  isign=The sign of the transform.
  gbound(2*mgfft+8,2)= The boundaries of the basis sphere of G vectors at a given k-point.
    See sphereboundary for more info.

SIDE EFFECTS

  ff(ldx*ldy*ldz*ndat)=
    input: The array with the data to be transformed.
    output: The results of the FFT.

SOURCE

917 subroutine fftpad_spc(ff, ngfft, nx, ny, nz, ldx, ldy, ldz, ndat, mgfft, isign, gbound)
918 
919 !Arguments ------------------------------------
920 !scalars
921  integer,intent(in) :: nx,ny,nz,ldx,ldy,ldz,ndat,mgfft,isign
922 !arrays
923  integer,intent(in) :: ngfft(18),gbound(2*mgfft+8,2)
924  complex(spc),target,intent(inout) :: ff(ldx*ldy*ldz*ndat)
925 
926 !Local variables-------------------------------
927 !scalars
928  integer :: fftalg,fftalga,fftalgc,ncount,p
929  character(len=500) :: msg
930 !arrays
931  real(dp),allocatable :: fofr(:,:),ftarr(:,:)
932 
933 ! *************************************************************************
934 
935  fftalg=ngfft(7); fftalga=fftalg/100; fftalgc=MOD(fftalg,10)
936 
937  select case (fftalga)
938 
939  case (FFT_FFTW3)
940    call fftw3_fftpad(ff, nx, ny, nz, ldx, ldy, ldz, ndat, mgfft, isign, gbound)
941 
942  case (FFT_DFTI)
943    call dfti_fftpad(ff, nx, ny, nz, ldx, ldy, ldz, ndat, mgfft, isign, gbound)
944 
945  case (FFT_SG)
946    ! Goedecker"s routines.
947    ! TODO: sg_fftpad is not the fastest routine, here I should call sg_fftrisc but I need
948    ! kg_kin that are not available in rho_tw_g, actually one should pass G-G0 due to
949    ! the shift introduced by the umklapp.
950    ncount = ldx*ldy*ldz*ndat
951 
952    ABI_MALLOC(fofr,(2,ldx*ldy*ldz*ndat))
953    ABI_MALLOC(ftarr,(2,ldx*ldy*ldz*ndat))
954 
955    do p=1,ldx*ldy*ldz*ndat
956      fofr(1,p) = REAL(ff(p))
957      fofr(2,p) = AIMAG(ff(p))
958    end do
959 
960    call sg_fftpad(ngfft(8),mgfft,nx,ny,nz,ldx,ldy,ldz,ndat,gbound,isign,fofr,ftarr)
961    !
962    ! Copy the results.
963    do p=1,ldx*ldy*ldz*ndat
964      ff(p) = CMPLX(ftarr(1,p), ftarr(2,p))
965    end do
966 
967    if (isign==-1) then  ! Here there might be numerical exceptions due to the holes.
968      ff = ff/(nx*ny*nz)
969    end if
970 
971    ABI_FREE(ftarr)
972    ABI_FREE(fofr)
973 
974  case default
975    write(msg,'(a,i0,a)')"fftalga = ", fftalga," not coded "
976    ABI_ERROR(msg)
977  end select
978 
979 end subroutine fftpad_spc

m_fft/fftu_mpi_utests [ Functions ]

[ Top ] [ m_fft ] [ Functions ]

NAME

 fftu_mpi_utests

FUNCTION

 Unit tests for the FFTs of wavefunctions (MPI version).

INPUTS

OUTPUT

  nfailed=number of failed tests.

SOURCE

1832 function fftu_mpi_utests(fftalg, ecut, rprimd, ndat, nthreads, comm_fft, paral_kgb, unit) result(nfailed)
1833 
1834 !Arguments ------------------------------------
1835 !scalars
1836  integer,intent(in) :: fftalg,ndat,nthreads,comm_fft,paral_kgb
1837  integer :: nfailed
1838  integer,optional,intent(in) :: unit
1839  real(dp),intent(in) :: ecut
1840 !arrays
1841  real(dp),intent(in) :: rprimd(3,3)
1842 
1843 !Local variables-------------------------------
1844 !scalars
1845  integer,parameter :: nsym1=1,npw0=0,cplex_one=1,istwfk_one=1
1846  integer :: n1,n2,n3,idat,n4,n5,n6,ierr,npw_k,full_npw_k,istwf_npw_k,cplexwf
1847  integer :: mgfft,istwf_k,ikpt,old_nthreads,ount,isign,fftalga,fftalgc
1848  integer :: ig,i1,i2,i3,i3_glob,i3dat,nd3proc,i3_local,g0sender
1849  integer :: step,me_g0,me_fft,nproc_fft,mpierr,nfft,cplex,chksymtnons
1850  real(dp),parameter :: boxcutmin2=two,ATOL_DP=tol12,RTOL_DP=tol3 ! Tolerances on the absolute and relative error
1851  real(dp),parameter :: weight_r=one,weight_i=one
1852  real(dp) :: max_abserr,max_relerr,ucvol,relerr,den,refden
1853  real(dp) :: ctime,wtime,gflops
1854  character(len=500) :: msg,info,library,cplex_mode,padding_mode
1855  type(distribfft_type) :: fftabs
1856 !arrays
1857  integer :: symrel(3,3,nsym1),ngfft(18)
1858  integer,allocatable :: full_kg_k(:,:),istw_kg_k(:,:)
1859  integer,allocatable :: gbound_k(:,:),kg_k(:,:)
1860  real(dp) :: dummy_fofg(0,0) !dummy_denpot(0,0,0)
1861  real(dp) :: kpoint(3),kpoints(3,2)
1862  real(dp) :: gmet(3,3),gprimd(3,3),rmet(3,3),tnons(3,nsym1)
1863  real(dp),allocatable :: fofg(:,:),ref_fofg(:,:),fofg_out(:,:),fofr(:,:,:,:)
1864  real(dp),allocatable :: density(:,:,:),pot(:,:,:),invpot(:,:,:)
1865  real(dp),allocatable :: full_fofg(:,:),istwf_fofg(:,:)
1866 
1867 ! *************************************************************************
1868 
1869  nfailed = 0
1870 
1871  ount = std_out; if (PRESENT(unit)) ount = unit
1872 
1873  if (nthreads > 0) then
1874    old_nthreads = xomp_get_max_threads()
1875    call xomp_set_num_threads(nthreads)
1876  end if
1877 
1878  if (.not. fftalg_has_mpi(fftalg)) then
1879    write(msg,'(a,i0,a)')"fftalg: ",fftalg," does not support MPI"
1880    ABI_ERROR(msg)
1881  end if
1882 
1883  nproc_fft = xmpi_comm_size(comm_fft); me_fft = xmpi_comm_rank(comm_fft)
1884 
1885  symrel = reshape([1,0,0,0,1,0,0,0,1],[3,3,nsym1])
1886  tnons=zero
1887  chksymtnons=0
1888  call metric(gmet,gprimd,-1,rmet,rprimd,ucvol)
1889 
1890  kpoints = RESHAPE( [ &
1891    0.1, 0.2, 0.3, &
1892    0.0, 0.0, 0.0 ], [3,2] )
1893 
1894  call fftalg_info(fftalg,library,cplex_mode,padding_mode)
1895 
1896  fftalga=fftalg/100; fftalgc=mod(fftalg,10)
1897 
1898  do ikpt=1,size(kpoints, dim=2)
1899    kpoint = kpoints(:,ikpt)
1900 
1901    ! Get full G-sphere (no MPI distribution, no time-reversal)
1902    call get_kg(kpoint,istwfk_one,ecut,gmet,full_npw_k,full_kg_k)
1903 
1904    ! Get full G-sphere (no MPI distribution, use time-reversal if possible)
1905    istwf_k = set_istwfk(kpoint)
1906    call get_kg(kpoint,istwf_k,ecut,gmet,istwf_npw_k,istw_kg_k)
1907 
1908    ! Get FFT box dims.
1909    ngfft = -1
1910    ngfft(7) = fftalg
1911    ngfft(8) = get_cache_kb()
1912 
1913    call getng(boxcutmin2,chksymtnons,ecut,gmet,kpoint,me_fft,mgfft,nfft,ngfft,nproc_fft,nsym1,&
1914               paral_kgb,symrel,tnons,unit=dev_null)
1915 
1916    n1 = ngfft(1); n2 = ngfft(2); n3 = ngfft(3)
1917    ! Do not use augmentation.
1918    !ngfft(4:6) = ngfft(1:3)
1919    n4 = ngfft(4); n5 = ngfft(5); n6 = ngfft(6)
1920 
1921    call print_ngfft(ngfft,"ngfft for MPI-fourwf",unit=std_out,mode_paral="COLL",prtvol=0)
1922 
1923    ! Compute FFT distribution tables.
1924    call init_distribfft(fftabs,"c",nproc_fft,n2,n3)
1925 
1926    ! Set to 1 if this node owns G = 0.
1927    me_g0 = 0; if (fftabs%tab_fftwf2_distrib(1) == me_fft) me_g0 = 1
1928    g0sender = fftabs%tab_fftwf2_distrib(1)
1929 
1930    ! Allocate u(g) on the full sphere, initialize with random numbers.
1931    ! and broadcast full data to the other nodes.
1932    ABI_MALLOC(full_fofg, (2,full_npw_k*ndat))
1933 
1934    if (me_fft == g0sender) then
1935      if (istwf_k == 1) then
1936        call RANDOM_NUMBER(full_fofg)
1937        !full_fofg = one
1938 
1939      else if (istwf_k == 2) then
1940        ! Special treatment for real wavefunctions.
1941        ! Fill the irreducible G-vectors first so that we have the u(g) of a real u(r)
1942        ! Then get u(g) on the full G-sphere.
1943        ! TODO: Sequential version OK, SIGSEV if mpi_ncpus > 1!
1944        ABI_MALLOC(istwf_fofg, (2,istwf_npw_k*ndat))
1945        call RANDOM_NUMBER(istwf_fofg)
1946        ! Enforce real u in G-space.
1947        do idat=1,ndat
1948          istwf_fofg(2,1+(idat-1)*istwf_npw_k) = zero
1949        end do
1950 
1951        ! from istwfk 2 to 1.
1952        call change_istwfk(istwf_npw_k,istw_kg_k,istwf_k,full_npw_k,full_kg_k,istwfk_one,n1,n2,n3,ndat,istwf_fofg,full_fofg)
1953        ABI_FREE(istwf_fofg)
1954 
1955      else
1956        ABI_ERROR("istwf_k /= [1,2] not available in MPI-FFT mode")
1957      end if
1958    end if
1959 
1960    call xmpi_bcast(full_fofg,g0sender,comm_fft,ierr)
1961 
1962    ! Compute sphere boundaries for zero-padded FFTs
1963    ABI_MALLOC(gbound_k,(2*mgfft+8,2))
1964    call sphereboundary(gbound_k,istwfk_one,full_kg_k,mgfft,full_npw_k)
1965 
1966    ! Extract my G-vectors from full_kg_k and store them in kg_k.
1967    !write(std_out,*)"fftwf2_distrib",fftabs%tab_fftwf2_distrib
1968    do step=1,2
1969      if (step == 2) then
1970        ! Allocate my u(g) and my Gs.
1971        ! Set fofg to zero to bypass a bug with XLF!
1972        ABI_MALLOC(kg_k, (3, npw_k))
1973        ABI_CALLOC(fofg, (2,npw_k*ndat))
1974      end if
1975 
1976      npw_k = 0
1977      do ig=1,full_npw_k
1978        i1=full_kg_k(1,ig); if(i1<0) i1=i1+n1; i1=i1+1
1979        i2=full_kg_k(2,ig); if(i2<0) i2=i2+n2; i2=i2+1
1980        i3=full_kg_k(3,ig); if(i3<0) i3=i3+n3; i3=i3+1
1981        if (fftabs%tab_fftwf2_distrib(i2) == me_fft) then
1982          npw_k = npw_k + 1
1983          if (step == 2) then
1984            kg_k(:,npw_k) = full_kg_k(:,ig)
1985            fofg(:,npw_k) = full_fofg(:,ig)
1986          end if
1987        end if
1988      end do
1989    end do ! step
1990 
1991    ABI_FREE(istw_kg_k)
1992 
1993    !write(std_out,*)"dist",trim(itoa(me_fft)),fftabs%tab_fftdp3_distrib(:) !== me_fft)
1994    !write(std_out,*)"local",trim(itoa(me_fft)),fftabs%tab_fftdp3_local(:)
1995 
1996    ! Allocate my local portion of u(r), and keep a copy my u(g).
1997    ABI_MALLOC(fofr, (2,n4,n5,n6*ndat))
1998    ABI_MALLOC(ref_fofg, (2,npw_k*ndat))
1999    ref_fofg = fofg
2000 
2001    call cwtime(ctime,wtime,gflops,"start")
2002 
2003    ! ----------------------------------------------------------------
2004    ! Test the the backward and the forward transform of wavefunctions
2005    ! ----------------------------------------------------------------
2006    ! c2c or [c2r, r2c]
2007    cplexwf = 2; if (istwf_k==2) cplexwf = 1
2008    ! FIXME:
2009    ! There's a bug somewhere in the MPI routines if cplexwf == 1 is used and ndat > 1
2010    ! Not a serious problem at present since cplexwf==1 is never used in abinit.
2011    if (ndat>1) cplexwf = 2
2012    if (nproc_fft > 1) cplexwf = 2
2013    cplexwf = 2
2014    !cplexwf = 2; if (istwf_k==2) cplexwf = 1
2015 
2016    do isign = 1,-1,-2
2017      call fftmpi_u(npw_k,n4,n5,n6,ndat,mgfft,ngfft,&
2018        istwfk_one,gbound_k,kg_k,me_g0,fftabs,isign,fofg,fofr,comm_fft,cplexwf=cplexwf)
2019    end do
2020 
2021    ! The final interface should be:
2022    !subroutine fftmpi_u(npw_k,n4,n5,n6,nspinor,ndat,mgfft,ngfft,istwf_k,kg_k,gbound_k,fftabs,isign,fofg,fofr)
2023 
2024    ! Parallel version (must support augmentation in forf(:,:,:,:), hence we have a different API wrt the seq case!
2025    !call fftmpi_ug(npw_k,n4,n5,n6,nspinor,ndat,mgfft,ngfft,istwf_k,kg_k,gbound_k,fftabs,fofg,fofr)
2026    !call fftmpi_ur(npw_k,n4,n5,n6,nspinor,ndat,mgfft,ngfft,istwf_k,kg_k,gbound_k,fftabs,fofr,fofgout)
2027 
2028    ! Seq version.
2029    !call fft_ug(npw_k,nxyz,nspinor,ndat,mgfft,ngfft,istwf_k,kg_k,gbound_k,ugsp,ursp)
2030    !call fft_ur(npw_k,nxyz,nspinor,ndat,mgfft,ngfft,istwf_k,kg_k,gbound_k,ursp,ugsp)
2031 
2032    call cwtime(ctime,wtime,gflops,"stop")
2033    if (me_fft == 0) then
2034      !write(std_out,'(a,i0,3(a,f10.4))')"fftalg: ",fftalg,", ecut: ",ecut,", cpu_time: ",ctime,", wall_time: ",wtime
2035    end if
2036 
2037    ! Check if F^{-1} F = I within ATOL_DP
2038    ierr = COUNT(ABS(fofg - ref_fofg) > ATOL_DP)
2039    call xmpi_sum(ierr,comm_fft,mpierr)
2040    nfailed = nfailed + ierr
2041 
2042    write(info,"(a,i1,a)")sjoin(library,"fftu_mpi, istwfk "),istwf_k," :"
2043 
2044    if (ierr /= 0) then
2045      max_abserr = MAXVAL(ABS(fofg - ref_fofg))
2046      call xmpi_max(max_abserr,comm_fft,mpierr)
2047      write(msg,"(a,es9.2,a)")" FAILED (max_abserr = ",max_abserr,")"
2048    else
2049      write(msg,"(a)")" OK"
2050    end if
2051    call wrtout(ount,sjoin(info, msg))
2052 
2053    ! -------------------------------------------
2054    ! Test the accumulation of density (option 1)
2055    ! -------------------------------------------
2056    ABI_CALLOC(density, (cplex_one*n4,n5,n6))
2057    !fofg = one
2058 
2059    ! Accumulate density. Does not work if cplexwf==1
2060    call fourwf_mpi(cplex_one,density,fofg,dummy_fofg,fofr,&
2061      gbound_k,gbound_k,istwfk_one,kg_k,kg_k,me_g0,mgfft,ngfft,fftabs,n1,n2,n3,&
2062      npw_k,npw_k,n4,n5,n6,ndat,1,weight_r,weight_i,comm_fft,cplexwf=cplexwf)
2063 
2064 !   Recompute u(r)
2065 !   call fourwf_mpi(cplex_one,density,fofg,dummy_fofg,fofr,&
2066 !&    gbound_k,gbound_k,istwf_one,kg_k,kg_k,me_g0,mgfft,ngfft,fftabs,n1,n2,n3,&
2067 !&    npw_k,npw_k,n4,n5,n6,ndat,0,weight_r,weight_i,comm_fft,cplexwf=cplexwf)
2068 !   call xmpi_sum(density,comm_fft,ierr)
2069 !   if (me_fft == 0) write(std_out,*)"sum density", sum(density)
2070 
2071    !do i3=1,n6
2072    !  write(110+me_fft,*)i3,density(:,:,i3)
2073    !end do
2074 
2075    max_relerr = zero
2076 
2077    ! FIXME: This is wrong if ndat > 1
2078    ! Must unify the treatment of fofr and rhor on the augmented mesh (n4,n5,n6)
2079    ! back_wf and for_wf return a MPI distributed arrays but fofr is allocated with
2080    ! the global dimensions.
2081    nd3proc=(n3-1)/nproc_fft+1
2082    do i3=1,n3
2083    !do i3=1,nd3proc
2084      if( me_fft == fftabs%tab_fftdp3_distrib(i3) ) then
2085        i3_local = fftabs%tab_fftdp3_local(i3) !+ nd3proc*(idat-1)
2086        !i3_local = i3
2087        i3_glob = i3
2088        !i3_glob = i3_local
2089        !i3_glob = i3 + nd3proc * me_fft
2090        !i3dat = i3  + n6 * (idat-1)
2091        do i2=1,n2
2092          do i1=1,n1
2093             den = density(i1,i2,i3_glob)
2094             refden = zero
2095             do idat=1,ndat
2096               i3dat = i3_local + n6 * (idat-1)
2097               refden = refden + weight_r*fofr(1,i1,i2,i3dat)**2+ weight_i*fofr(2,i1,i2,i3dat)**2
2098             end do
2099             relerr = abs(refden - den)
2100             if (abs(refden) < tol12) refden = tol12
2101             relerr = relerr / abs(refden)
2102             max_relerr = max(max_relerr, relerr)
2103             !if (relerr > RTOL_DP) write(std_out,*)trim(itoa(me_fft)),i1,i2,i3,idat,den,refden
2104          end do
2105        end do
2106      end if
2107    end do
2108 
2109    call xmpi_max(max_relerr,comm_fft,mpierr)
2110 
2111    write(info,"(a,i1,a)")sjoin(library,"accrho_mpi, istwfk "),istwf_k," :"
2112    if (max_relerr > RTOL_DP) then
2113      write(msg,"(a,es9.2,a)")" FAILED (max_relerr = ",max_relerr,")"
2114    else
2115      write(msg,"(a)")" OK"
2116    end if
2117    call wrtout(ount, sjoin(info, msg))
2118 
2119    ABI_FREE(density)
2120 
2121    ! -------------------------------------------
2122    ! Test the application of the local potential
2123    ! -------------------------------------------
2124    cplex = 1
2125    ABI_MALLOC(pot, (cplex*n4,n5,n6))
2126    ABI_MALLOC(invpot, (cplex*n4,n5,n6))
2127 
2128    if (me_fft == g0sender) then
2129      !pot = fofr(1,:,:,:)
2130      !call RANDOM_NUMBER(pot)
2131      !where (abs(pot) < tol4) pot = tol4
2132      ! Simplest potential ever (G=0 to reduce errors due to aliasing)
2133      pot = four
2134      invpot = one/pot
2135    end if
2136 
2137    call xmpi_bcast(pot,g0sender,comm_fft,ierr)
2138    call xmpi_bcast(invpot,g0sender,comm_fft,ierr)
2139 
2140    ABI_MALLOC(fofg_out, (2,npw_k*ndat))
2141 
2142    ! Compute fofg_out = <G|pot(r)|fofg>
2143    call fourwf_mpi(cplex,pot,fofg,fofg_out,fofr,&
2144 &    gbound_k,gbound_k,istwfk_one,kg_k,kg_k,me_g0,mgfft,ngfft,fftabs,n1,n2,n3,&
2145 &    npw_k,npw_k,n4,n5,n6,ndat,2,weight_r,weight_i,comm_fft,cplexwf=cplexwf)
2146 
2147    ! Compute fofg = <G|1/pot(r)|fofg_out>
2148    call fourwf_mpi(cplex,invpot,fofg_out,fofg,fofr,&
2149 &    gbound_k,gbound_k,istwfk_one,kg_k,kg_k,me_g0,mgfft,ngfft,fftabs,n1,n2,n3,&
2150 &    npw_k,npw_k,n4,n5,n6,ndat,2,weight_r,weight_i,comm_fft,cplexwf=cplexwf)
2151 
2152    ! Check if we got the initial u(g) within ATOL_DP
2153    ierr = COUNT(ABS(fofg - ref_fofg) > ATOL_DP)
2154    call xmpi_sum(ierr,comm_fft,mpierr)
2155    nfailed = nfailed + ierr
2156 
2157    write(info,"(a,i1,a)")sjoin(library,"<G|vloc|u>, istwfk "),istwf_k," :"
2158 
2159    if (ierr /= 0) then
2160      max_abserr = MAXVAL(ABS(fofg - ref_fofg))
2161      call xmpi_max(max_abserr,comm_fft,mpierr)
2162      write(msg,"(a,es9.2,a)")" FAILED (max_abserr = ",max_abserr,")"
2163      !if (me_fft == 0) write(std_out,*)(fofg(:,ig),ref_fofg(:,ig), ig=1,npw_k*ndat)
2164    else
2165      write(msg,"(a)")" OK"
2166    end if
2167    call wrtout(ount, sjoin(info, msg))
2168 
2169    ABI_FREE(fofg_out)
2170    ABI_FREE(pot)
2171    ABI_FREE(invpot)
2172    ABI_FREE(kg_k)
2173    ABI_FREE(gbound_k)
2174    ABI_FREE(fofg)
2175    ABI_FREE(ref_fofg)
2176    ABI_FREE(fofr)
2177    ABI_FREE(full_kg_k)
2178    ABI_FREE(full_fofg)
2179 
2180    call destroy_distribfft(fftabs)
2181  end do
2182 
2183  if (nthreads > 0) call xomp_set_num_threads(old_nthreads)
2184 
2185 end function fftu_mpi_utests

m_fft/fftu_utests [ Functions ]

[ Top ] [ m_fft ] [ Functions ]

NAME

 fftu_utests

FUNCTION

 Unit tests for the FFTs of wavefunctions (sequential version).

INPUTS

OUTPUT

  nfailed=number of failed tests.

SOURCE

1437 function fftu_utests(ecut, ngfft, rprimd, ndat, nthreads, unit) result(nfailed)
1438 
1439 !Arguments ------------------------------------
1440 !scalars
1441  integer,intent(in) :: ndat,nthreads
1442  integer :: nfailed
1443  integer,optional,intent(in) :: unit
1444  real(dp),intent(in) :: ecut
1445 !arrays
1446  integer,intent(in) :: ngfft(18)
1447  real(dp),intent(in) :: rprimd(3,3)
1448 
1449 !Local variables-------------------------------
1450 !scalars
1451  integer,parameter :: nspinor1=1,mkmem1=1,exchn2n3d0=0,ikg0=0
1452  integer :: nx,ny,nz,nxyz,ldx,ldy,ldz,ierr,npw_k,mgfft,istwf_k,ikpt,ldxyz,ipw,old_nthreads,ount
1453  integer :: fftalg,npw_k_test
1454  real(dp),parameter :: ATOL_SP=tol6, ATOL_DP=tol12 ! Tolerances on the absolute error
1455  real(dp) :: max_abserr,ucvol
1456  character(len=500) :: msg,info,library,cplex_mode,padding_mode
1457 !arrays
1458  integer :: kg_dum(3,0)
1459  integer,allocatable :: gbound_k(:,:),kg_k(:,:)
1460  real(dp) :: kpoint(3),crand(2),kpoints(3,9)
1461  real(dp) :: gmet(3,3),gprimd(3,3),rmet(3,3)
1462  real(dp),allocatable :: cg(:,:),cg_ref(:,:),cr(:,:)
1463  complex(spc),allocatable :: ugsp(:),ug_refsp(:),ursp(:)
1464  complex(dpc),allocatable :: ug(:),ug_ref(:),ur(:)
1465  type(MPI_type) :: MPI_enreg_seq
1466 
1467 ! *************************************************************************
1468 
1469  ount = std_out; if (PRESENT(unit)) ount = unit
1470 
1471  nfailed = 0
1472  fftalg = ngfft(7)
1473 
1474  if (nthreads > 0) then
1475    old_nthreads = xomp_get_max_threads()
1476    call xomp_set_num_threads(nthreads)
1477  end if
1478 
1479  call metric(gmet,gprimd,-1,rmet,rprimd,ucvol)
1480 
1481  nx  = ngfft(1);  ny = ngfft(2);  nz = ngfft(3)
1482  ldx = ngfft(4); ldy = ngfft(5); ldz = ngfft(6)
1483  mgfft = MAXVAL(ngfft(1:3))
1484 
1485  nxyz =  nx* ny* nz
1486  ldxyz = ldx*ldy*ldz
1487 
1488  ABI_CALLOC(cg_ref, (2, ldxyz*ndat))
1489  ABI_CALLOC(cg,     (2, ldxyz*ndat))
1490  ABI_CALLOC(cr,     (2, ldxyz*ndat))
1491 
1492  ABI_CALLOC(ug_ref, (ldxyz*ndat))
1493  ABI_CALLOC(ug,     (ldxyz*ndat))
1494  ABI_CALLOC(ur,     (ldxyz*ndat))
1495 
1496  ABI_CALLOC(ug_refsp, (ldxyz*ndat))
1497  ABI_CALLOC(ugsp,     (ldxyz*ndat))
1498  ABI_CALLOC(ursp,     (ldxyz*ndat))
1499 
1500  kpoints = RESHAPE([ &
1501    0.1, 0.2, 0.3, &
1502    0.0, 0.0, 0.0, &
1503    0.5, 0.0, 0.0, &
1504    0.0, 0.0, 0.5, &
1505    0.5, 0.0, 0.5, &
1506    0.0, 0.5, 0.0, &
1507    0.5, 0.5, 0.0, &
1508    0.0, 0.5, 0.5, &
1509    0.5, 0.5, 0.5], [3, 9])
1510 
1511  call fftalg_info(fftalg,library,cplex_mode,padding_mode)
1512 
1513  call initmpi_seq(MPI_enreg_seq)
1514 
1515  do ikpt=1,SIZE(kpoints,DIM=2)
1516    kpoint = kpoints(:,ikpt)
1517    istwf_k = set_istwfk(kpoint)
1518 
1519    ! Calculate the number of G-vectors for this k-point.
1520    call kpgsph(ecut,exchn2n3d0,gmet,ikg0,0,istwf_k,kg_dum,kpoint,0,MPI_enreg_seq,0,npw_k)
1521 
1522    ! Allocate and calculate the set of G-vectors.
1523    ABI_MALLOC(kg_k,(3,npw_k))
1524    call kpgsph(ecut,exchn2n3d0,gmet,ikg0,0,istwf_k,kg_k,kpoint,mkmem1,MPI_enreg_seq,npw_k,npw_k_test)
1525 
1526    ABI_MALLOC(gbound_k,(2*mgfft+8,2))
1527    call sphereboundary(gbound_k,istwf_k,kg_k,mgfft,npw_k)
1528 
1529    !if (istwf_k==2) then
1530    !  do ipw=1,npw_k
1531    !    write(std_out,*)ipw, kg_k(:,ipw)
1532    !  end do
1533    !  stop
1534    !end if
1535 
1536 #if 0
1537    !TODO
1538    ! ================================================
1539    ! === Test the double precision 2*real version ===
1540    ! ================================================
1541    do ipw=1,npw_k*ndat
1542      call RANDOM_NUMBER(crand)
1543      cg_ref(:,ipw) = crand(:)
1544    end do
1545 
1546    if (istwf_k == 2) then
1547      do ipw=1,npw_k*ndat,npw_k
1548        cg_ref(2,ipw) = zero
1549      end do
1550    end if
1551 
1552    cg = cg_ref
1553 
1554    call fft_ug_dp(npw_k,nxyz,nspinor1,ndat,mgfft,ngfft,istwf_k,kg_k,gbound_k,cg,cr)
1555    call fft_ur_dp(npw_k,nxyz,nspinor1,ndat,mgfft,ngfft,istwf_k,kg_k,gbound_k,cr,cg)
1556 
1557    ierr = COUNT(ABS(cg - cg_ref) > ATOL_DP)
1558    nfailed = nfailed + ierr
1559 
1560    write(info,"(a,i1,a)")sjoin(library,"fftu_dp, istwfk "),istwf_k," :"
1561    if (ierr /= 0) then
1562      max_abserr = MAXVAL(ABS(cg - cg_ref))
1563      write(msg,"(a,es9.2,a)")" FAILED (max_abserr = ",max_abserr,")"
1564    else
1565      write(msg,"(a)")" OK"
1566    end if
1567    call wrtout(ount,sjoin(info, msg))
1568 #endif
1569 
1570    ! =================================================
1571    ! === Test the single precision complex version ===
1572    ! =================================================
1573    do ipw=1,npw_k*ndat
1574      call RANDOM_NUMBER(crand)
1575      ug_refsp(ipw) = CMPLX(crand(1), crand(2))
1576    end do
1577 
1578    if (istwf_k == 2) then
1579      do ipw=1,npw_k*ndat,npw_k
1580        ug_refsp(ipw) = REAL(ug_refsp(ipw))
1581      end do
1582    end if
1583 
1584    ugsp = ug_refsp
1585 
1586    call fft_ug(npw_k,nxyz,nspinor1,ndat,mgfft,ngfft,istwf_k,kg_k,gbound_k,ugsp,ursp)
1587    call fft_ur(npw_k,nxyz,nspinor1,ndat,mgfft,ngfft,istwf_k,kg_k,gbound_k,ursp,ugsp)
1588 
1589    ierr = COUNT(ABS(ugsp - ug_refsp) > ATOL_SP)
1590    nfailed = nfailed + ierr
1591 
1592    write(info,"(a,i1,a)")sjoin(library,"fftu_spc, istwfk "),istwf_k," :"
1593    if (ierr /= 0) then
1594      max_abserr = MAXVAL(ABS(ugsp - ug_refsp))
1595      write(msg,"(a,es9.2,a)")" FAILED (max_abserr = ",max_abserr,")"
1596    else
1597      write(msg,"(a)")" OK"
1598    end if
1599    call wrtout(ount,sjoin(info, msg))
1600 
1601    ! =================================================
1602    ! === Test the double precision complex version ===
1603    ! =================================================
1604    do ipw=1,npw_k*ndat
1605      call RANDOM_NUMBER(crand)
1606      ug_ref(ipw) = DCMPLX(crand(1), crand(2))
1607    end do
1608 
1609    if (istwf_k == 2) then
1610      do ipw=1,npw_k*ndat,npw_k
1611        ug_ref(ipw) = REAL(ug_ref(ipw))
1612      end do
1613    end if
1614 
1615    ug = ug_ref
1616 
1617    call fft_ug(npw_k,nxyz,nspinor1,ndat,mgfft,ngfft,istwf_k,kg_k,gbound_k,ug,ur)
1618    call fft_ur(npw_k,nxyz,nspinor1,ndat,mgfft,ngfft,istwf_k,kg_k,gbound_k,ur,ug)
1619 
1620    ierr = COUNT(ABS(ug - ug_ref) > ATOL_DP)
1621    nfailed = nfailed + ierr
1622 
1623    write(info,"(a,i1,a)")sjoin(library,"fftu_dpc, istwfk "),istwf_k," :"
1624    if (ierr /= 0) then
1625      max_abserr = MAXVAL(ABS(ug - ug_ref))
1626      write(msg,"(a,es9.2,a)")" FAILED (max_abserr = ",max_abserr,")"
1627    else
1628      write(msg,"(a)")" OK"
1629    end if
1630    call wrtout(ount, sjoin(info, msg))
1631 
1632    ABI_FREE(kg_k)
1633    ABI_FREE(gbound_k)
1634  end do
1635 
1636  ABI_FREE(cg_ref)
1637  ABI_FREE(cg)
1638  ABI_FREE(cr)
1639  ABI_FREE(ug_ref)
1640  ABI_FREE(ug)
1641  ABI_FREE(ur)
1642  ABI_FREE(ug_refsp)
1643  ABI_FREE(ugsp)
1644  ABI_FREE(ursp)
1645  call destroy_mpi_enreg(MPI_enreg_seq)
1646 
1647  if (nthreads > 0) call xomp_set_num_threads(old_nthreads)
1648 
1649 end function fftu_utests

m_fft/fourdp_6d [ Functions ]

[ Top ] [ m_fft ] [ Functions ]

NAME

 fourdp_6d

FUNCTION

     Calculate a 6-dimensional Fast Fourier Transform

     isign=-1 : A(G1,G2) = Sum(r1,r2) A(r1,r2) exp(-iG1.r1) exp(+iG2.r2)
                                                  ^            ^
     isign=+1 : A(r1,r2) = Sum(G1,G2) A(G1,G2) exp(+iG1.r1) exp(-iG2.r2)
                                                  ^            ^
     isign=-1 and isign=1 form a transform/inverse-transform pair: calling
     one then the other will take you back to the original function,
     multiplied by a factor of (nl*nm*nn)**2.
     ------------------------------------------------------------------

     input:
      a: A(r1,r2) [overwritten]
     output:
      a: A(G1,G2) in the format IGFFT
     ------------------------------------------------------------------

INPUTS

OUTPUT

SIDE EFFECTS

NOTES

SOURCE

4295 subroutine fourdp_6d(cplex,matrix,isign,MPI_enreg,nfft,ngfft,tim_fourdp)
4296 
4297 !Arguments ------------------------------------
4298 !scalars
4299  integer,intent(in) :: cplex,isign,nfft,tim_fourdp
4300  type(MPI_type),intent(in) :: MPI_enreg
4301 !arrays
4302  integer,intent(in) :: ngfft(18)
4303  complex(gwpc),intent(inout) :: matrix(nfft,nfft)
4304 
4305 !Local variables-------------------------------
4306 !scalars
4307  !integer,parameter :: cplex=2
4308  integer :: i1,i2,i3,ifft
4309  integer :: n1,n2,n3
4310 !arrays
4311  real(dp),allocatable :: fofg(:,:),fofr(:)
4312 
4313 ! *************************************************************************
4314 
4315 !TODO check normalization factor, it is better if we use the GW conventions.
4316  n1 = ngfft(1)
4317  n2 = ngfft(2)
4318  n3 = ngfft(3)
4319 
4320  ABI_MALLOC(fofg,(2,nfft))
4321  ABI_MALLOC(fofr,(cplex*nfft))
4322 
4323  do i3=0,n3-1
4324    do i2=0,n2-1
4325      do i1=0,n1-1
4326 
4327        ifft=1+i1+i2*n1+i3*n1*n2
4328        if (isign==1) then
4329          ! G1 -> r1 transform for each G2 to form A(r1,G2)
4330          fofg(1,:)=REAL (matrix(:,ifft))
4331          fofg(2,:)=AIMAG(matrix(:,ifft))
4332        else if (isign==-1) then
4333          ! r1 -> G1 transform for each r2 to form A(G1,r2)
4334          fofr(1:nfft)       =REAL (matrix(:,ifft))
4335          fofr(nfft+1:2*nfft)=AIMAG(matrix(:,ifft))
4336        else
4337          ABI_ERROR("Wrong isign")
4338        end if
4339 
4340        call fourdp(cplex,fofg,fofr,isign,MPI_enreg,nfft,1,ngfft,tim_fourdp)
4341 
4342        if (isign==1) then ! Save A(r1,G2)
4343          matrix(:,ifft)=CMPLX(fofr(1:nfft),fofr(nfft+1:2*nfft))
4344        else if (isign==-1) then ! Save A(G1,r2)
4345          matrix(:,ifft)=CMPLX(fofg(1,:),fofg(2,:))
4346        end if
4347 
4348      end do
4349    end do
4350  end do
4351 
4352  do i3=0,n3-1
4353    do i2=0,n2-1
4354      do i1=0,n1-1
4355 
4356        ifft=1+i1+i2*n1+i3*n1*n2
4357        if (isign==1) then
4358          ! Do the G2 -> r2 transform of A(r1,G2) to get A(r1,r2)
4359          fofr(1:nfft       )=REAL (matrix(ifft,:))
4360          fofr(nfft+1:2*nfft)=AIMAG(matrix(ifft,:))
4361        else if (isign==-1) then
4362          ! Do the r2 -> G2 transform of A(G1,r2) to get A(G1,G2)
4363          fofg(1,:)=REAL (matrix(ifft,:))
4364          fofg(2,:)=AIMAG(matrix(ifft,:))
4365        end if
4366 
4367        call fourdp(2,fofg,fofr,-isign,MPI_enreg,nfft,1,ngfft,tim_fourdp)
4368 
4369        if (isign==1) then
4370          matrix(ifft,:)=CMPLX(fofg(1,:),fofg(2,:))
4371        else if (isign==-1) then
4372          matrix(ifft,:)=CMPLX(fofr(1:nfft),fofr(nfft+1:2*nfft))
4373        end if
4374 
4375      end do
4376    end do
4377  end do
4378 
4379  ABI_FREE(fofg)
4380  ABI_FREE(fofr)
4381 
4382 end subroutine fourdp_6d

m_fft/fourdp_mpi [ Functions ]

[ Top ] [ m_fft ] [ Functions ]

NAME

 fourdp_mpi

FUNCTION

 Conduct Fourier transform of REAL or COMPLEX function f(r)=fofr defined on
 fft grid in real space, to create complex f(G)=fofg defined on full fft grid
 in reciprocal space, in full storage mode, or the reverse operation.
 For the reverse operation, the final data is divided by nfftot.
 REAL case when cplex=1, COMPLEX case when cplex=2
 Usually used for density and potentials.

INPUTS

 cplex=1 if fofr is real, 2 if fofr is complex
 nfft=(effective) number of FFT grid points (for this processor)
 ngfft(18)=contain all needed information about 3D FFT, see ~abinit/doc/variables/vargs.htm#ngfft
 ndat=Numbre of FFT transforms
 isign=sign of Fourier transform exponent: current convention uses
    +1 for transforming from G to r
    -1 for transforming from r to G.
 fftn2_distrib(2),ffti2_local(2)
 fftn3_distrib(3),ffti3_local(3)
 comm_fft=MPI communicator

SIDE EFFECTS

 Input/Output
 fofg(2,nfft)=f(G), complex.
 fofr(cplex*nfft)=input function f(r) (real or complex)

TODO

  Write simplified API for sequential version.

SOURCE

3471 subroutine fourdp_mpi(cplex,nfft,ngfft,ndat,isign,&
3472 &  fftn2_distrib,ffti2_local,fftn3_distrib,ffti3_local,fofg,fofr,comm_fft)
3473 
3474 !Arguments ------------------------------------
3475 !scalars
3476  integer,intent(in) :: cplex,isign,nfft,ndat,comm_fft
3477 !arrays
3478  integer,intent(in) :: ngfft(18)
3479  integer,intent(in) :: fftn2_distrib(ngfft(2)),ffti2_local(ngfft(2))
3480  integer,intent(in) :: fftn3_distrib(ngfft(3)),ffti3_local(ngfft(3))
3481  real(dp),intent(inout) :: fofg(2,nfft*ndat),fofr(cplex*nfft*ndat)
3482 
3483 !Local variables-------------------------------
3484 !scalars
3485  integer :: fftalg,fftalga,fftalgc
3486  character(len=500) :: msg
3487 
3488 ! *************************************************************************
3489 
3490  fftalg=ngfft(7); fftalga=fftalg/100 ; fftalgc=mod(fftalg,10)
3491 
3492  select case (fftalga)
3493  case (FFT_SG2002)
3494    call sg2002_mpifourdp(cplex,nfft,ngfft,ndat,isign,&
3495      fftn2_distrib,ffti2_local,fftn3_distrib,ffti3_local,fofg,fofr,comm_fft)
3496 
3497  case (FFT_FFTW3)
3498    call fftw3_mpifourdp(cplex,nfft,ngfft,ndat,isign,&
3499      fftn2_distrib,ffti2_local,fftn3_distrib,ffti3_local,fofg,fofr,comm_fft)
3500 
3501  ! TODO
3502  !case (FFT_DFTI)
3503  !   call dfti_mpifourdp(cplex,nfft,ngfft,ndat,isign,&
3504  !&    fftn2_distrib,ffti2_local,fftn3_distrib,ffti3_local,fofg,fofr,comm_fft)
3505 
3506  case default
3507    write(msg,"(a,i0)")"Wrong fftalg: ",fftalg
3508    ABI_BUG(msg)
3509  end select
3510 
3511 end subroutine fourdp_mpi

m_fft/fourwf_mpi [ Functions ]

[ Top ] [ m_fft ] [ Functions ]

NAME

 fourwf_mpi

FUNCTION

 Carry out composite Fourier transforms between real and reciprocal (G) space.
 Wavefunctions, contained in a sphere in reciprocal space,
 can be FFT to real space. They can also be FFT from real space
 to a sphere. Also, the density maybe accumulated, and a local
 potential can be applied.

 The different options are :
 - reciprocal to real space and output the result (option=0),
 - reciprocal to real space and accumulate the density (option=1)
 - reciprocal to real space, apply the local potential to the wavefunction
    in real space and produce the result in reciprocal space (option=2)
 - real space to reciprocal space (option=3).

 Schedule of operations
 - fftalgc=1 : use separate forward and backward transforms
     (7/12 savings in execution time);
 - fftalgc=2 : in case of option=1 and option=2, use routines for composite operation
     even faster than 1x1

 Also for better speed, it uses no F90 construct, except the allocate command and for zeroing arrays.

INPUTS

 cplex= if 1 , denpot is real, if 2 , denpot is complex
    (cplex=2 only allowed for option=2, and istwf_k=1)
    not relevant if option=0 or option=3, so cplex=0 can be used to minimize memory
 fftalgc=1 or 2 => simple or composite FFT applications
 fofgin(2,npwin)=holds input wavefunction in G vector basis sphere.
                 (intent(in) but the routine sphere can modify it for another iflag)
 gboundin(2*mgfft+8,2)=sphere boundary info for reciprocal to real space
 gboundout(2*mgfft+8,2)=sphere boundary info for real to reciprocal space
 istwf_k=option parameter that describes the storage of wfs
 kg_kin(3,npwin)=reduced planewave coordinates, input
 kg_kout(3,npwout)=reduced planewave coordinates, output
 me_g0=1 if this MPI node treats the Gamma, 0 otherwise
 mgfft=maximum size of 1D FFTs
 ngfft(18)=contain all needed information about 3D FFT, see ~abinit/doc/variables/vargs.htm#ngfft
 distribfft<distribfft_type>=Tables needed for the FFT parallelism
 n1,n2,n3=1D FFT sizes
 npwin=number of elements in fofgin array (for option 0, 1 and 2)
 npwout=number of elements in fofgout array (for option 2 and 3)
 n4,n5,n6=dimensions of fofr.
 ndat=Number of FFTs
 option= if 0: do direct FFT
         if 1: do direct FFT, then sum the density
         if 2: do direct FFT, multiply by the potential, then do reverse FFT
         if 3: do reverse FFT only
 weight_r=weight to be used for the accumulation of the density in real space
         (needed only when option=1)
 weight_i=weight to be used for the accumulation of the density in real space
         (needed only when option=1)
 comm_fft=MPI communicator.
 [cplexwf]= 1 c2r or r2c can be used. Default: 2 i.e. use complex-to-complex FFTs.

OUTPUT

  (see side effects)

SIDE EFFECTS

 Input/Output
 for option==0, fofgin(2,npwin)=holds input wavefunction in G sphere;
                fofr(2,n4,n5,n6) contains the output Fourier Transform of fofgin;
                no use of denpot, fofgout and npwout.
 for option==1, fofgin(2,npwin)=holds input wavefunction in G sphere;
                denpot(cplex*n4,n5,n6) contains the input density at input,
                and the updated density at output (accumulated);
                no use of fofgout and npwout.
 for option==2, fofgin(2,npwin)=holds input wavefunction in G sphere;
                denpot(cplex*n4,n5,n6) contains the input local potential;
                fofgout(2,npwout) contains the output function;
 for option==3, fofr(2,n4,n5,n6) contains the input real space wavefunction;
                fofgout(2,npwout) contains its output Fourier transform;
                no use of fofgin and npwin.

SOURCE

3597 subroutine fourwf_mpi(cplex,denpot,fofgin,fofgout,fofr,&
3598 &  gboundin,gboundout,istwf_k,kg_kin,kg_kout,me_g0,mgfft,ngfft,distribfft,n1,n2,n3,&
3599 &  npwin,npwout,n4,n5,n6,ndat,option,weight_r,weight_i,comm_fft,cplexwf)
3600 
3601 
3602 !Arguments ------------------------------------
3603 !scalars
3604  integer,intent(in) :: cplex,istwf_k,mgfft,n1,n2,n3,n4,n5,n6,npwin,ndat
3605  integer,intent(in) :: npwout,option,comm_fft,me_g0
3606  integer,intent(in),optional :: cplexwf
3607  real(dp),intent(in) :: weight_r,weight_i
3608  type(distribfft_type),intent(in) :: distribfft
3609 !arrays
3610  integer,intent(in) :: gboundin(2*mgfft+8,2),gboundout(2*mgfft+8,2),ngfft(18)
3611  integer,intent(in) :: kg_kin(3,npwin),kg_kout(3,npwout)
3612  real(dp),intent(inout) :: denpot(cplex*n4,n5,n6),fofgin(2,npwin*ndat)
3613  real(dp),intent(inout) :: fofr(2,n4,n5,n6*ndat)
3614  real(dp),intent(out) :: fofgout(2,npwout*ndat)
3615 
3616 !Local variables-------------------------------
3617 !scalars
3618  integer :: fftalg,fftalga,fftalgc,idat
3619  integer :: cplexwf_,i1,i2,i3,iflag,ig,igdat,me_fft,m1i,m1o,m2i,m2o,m3i
3620  integer :: m3o,max1i,max1o,max2i,max2i_plus,max2o,max2o_plus
3621  integer :: max3i,max3o,md1,md1i,md1o,md2,md2i,md2o,md2proc
3622  integer :: md3,md3i,md3o,min1i,min1o,min2i,min2i_moins,min2o,min2o_moins,min3i,min3o
3623  integer :: nd3proc,nproc_fft,n6eff,i3_glob,i2_loc,i2dat_loc,i3dat
3624  integer,save :: nwrites_ialltoall=0
3625  logical :: use_ialltoall
3626  real(dp) :: fim,fre,xnorm
3627  character(len=500) :: msg
3628 !arrays
3629  integer,parameter :: shiftg0(3)=0
3630  integer,parameter :: symmE(3,3)=reshape([1,0,0,0,1,0,0,0,1],[3,3])
3631 ! real(dp) :: tsec(2)
3632  real(dp) :: weight_array_r(ndat), weight_array_i(ndat)
3633  real(dp),allocatable :: workf(:,:,:,:)
3634 
3635 ! *************************************************************************
3636 
3637  !call timab(540,1,tsec)
3638 
3639  fftalg=ngfft(7); fftalga=fftalg/100 ; fftalgc=mod(fftalg,10)
3640 
3641  me_fft=ngfft(11); nproc_fft=ngfft(10)
3642  !write(std_out,*)"nproc_fft",nproc_fft
3643  ! Risky since I don't know if this entry is initialized
3644  ! Continue to pass comm_fft explicitly.
3645  !comm_fft = ngfft(14)
3646 
3647  if (fftalgc<1 .or. fftalgc>2) then
3648    write(msg,'(a,i0,3a)')&
3649     'The input algorithm number fftalgc=',fftalgc,' is not allowed with MPI-FFT. Must be 1 or 2',ch10,&
3650     'Action: change fftalgc in your input file.'
3651    ABI_ERROR(msg)
3652  end if
3653 
3654  if (option<0 .or. option>3) then
3655    write(msg,'(a,i0,3a)')&
3656     'The option number',option,' is not allowed.',ch10,&
3657     'Only option=0, 1, 2 or 3 are allowed presently.'
3658    ABI_ERROR(msg)
3659  end if
3660 
3661  if (option==1 .and. cplex/=1) then
3662    write(msg,'(a,i0,a)')&
3663     'With the option number 1, cplex must be 1 but it is cplex=',cplex,'.'
3664    ABI_ERROR(msg)
3665  end if
3666 
3667  if ( ALL(cplex/=(/1,2/)) .and. ANY(option==(/1,2/)) ) then
3668    write(msg,'(a,i0,a)')' When option is (1,2) cplex must be 1 or 2, but it is cplex=',cplex,'.'
3669    ABI_ERROR(msg)
3670  end if
3671 
3672  !write(std_out,*)"in fourwf_mpi with fftalg: ",fftalg,fftalgc
3673 
3674 ! We use the non-blocking version if IALLTOALL is available and ndat > 1.
3675  use_ialltoall = .False.
3676 #ifdef HAVE_MPI_IALLTOALL
3677  use_ialltoall = (ndat > 1)
3678 #endif
3679  use_ialltoall = (use_ialltoall .and. ALLOW_IALLTOALL)
3680  if (use_ialltoall .and. nwrites_ialltoall==0) then
3681    nwrites_ialltoall = 1
3682    call wrtout(std_out, "- Will use non-blocking ialltoall for MPI-FFT")
3683  end if
3684 
3685  md1i=0; md2i=0; md3i=0; m2i=0
3686  md1o=0; md2o=0; md3o=0; m2o=0
3687 
3688  if (option/=3) then
3689    ! Compute the dimensions of the small-box enclosing the input G-sphere
3690    max1i=gboundin(2,1); min1i=gboundin(1,1)
3691    max2i=gboundin(4,1); min2i=gboundin(3,1)
3692 
3693    if(istwf_k==2 .or. istwf_k==4 .or. istwf_k==6 .or. istwf_k==8)then
3694      max1i=max(max1i,-min1i)
3695      min1i=-max1i
3696    else if (istwf_k==3 .or. istwf_k==5 .or. istwf_k==7 .or. istwf_k==9) then
3697      max1i=max(max1i,-min1i-1)
3698      min1i=-max1i-1
3699    end if
3700    if (istwf_k>=2 .and. istwf_k<=5) then
3701      max2i=max(max2i,-min2i)
3702      min2i=-max2i
3703    else if (istwf_k>=6 .and. istwf_k<=9) then
3704      max2i=max(max2i,-min2i-1)
3705      min2i=-max2i-1
3706    end if
3707 
3708    max3i=gboundin(4,2); min3i=gboundin(3,2)
3709 
3710    ! Compute arrays size and leading dimensions to avoid cache trashing
3711    m1i=max1i-min1i+1; md1i=2*(m1i/2)+1
3712    m2i=max2i-min2i+1; md2i=2*(m2i/2)+1
3713 
3714    !if (.False.) then
3715    if (nproc_fft/=1) then
3716      ! Increase max2i in order to have m2i divisible by nproc_fft
3717      min2i_moins=(((m2i-1)/nproc_fft+1)*nproc_fft-m2i)/2
3718      max2i_plus=((m2i-1)/nproc_fft+1)*nproc_fft-m2i-min2i_moins
3719      ! max2i=max2i+((m2i-1)/nproc_fft+1)*nproc_fft-m2i
3720      max2i=max2i+max2i_plus
3721      min2i=min2i-min2i_moins
3722      ! careful, to be checked and make sure the max and min are smaller than size of box
3723      m2i=max2i-min2i+1; md2i=2*(m2i/2)+1
3724    end if
3725    ABI_CHECK(m2i <= n2, "m2i > n2")
3726 
3727    m3i=max3i-min3i+1; md3i=2*(m3i/2)+1
3728  end if
3729 
3730  if (option==2 .or. option==3) then
3731    ! Compute the dimensions of the small-box enclosing the output G-sphere
3732    max1o=gboundout(2,1); min1o=gboundout(1,1)
3733    max2o=gboundout(4,1); min2o=gboundout(3,1)
3734 
3735    if (istwf_k==2 .or. istwf_k==4 .or. istwf_k==6 .or. istwf_k==8) then
3736      max1o=max(max1o,-min1o)
3737      min1o=-max1o
3738    else if (istwf_k==3 .or. istwf_k==5 .or. istwf_k==7 .or. istwf_k==9) then
3739      max1o=max(max1o,-min1o-1)
3740      min1o=-max1o-1
3741    end if
3742    if (istwf_k>=2 .and. istwf_k<=5) then
3743      max2o=max(max2o,-min2o)
3744      min2o=-max2o
3745    else if (istwf_k>=6 .and. istwf_k<=9) then
3746      max2o=max(max2o,-min2o-1)
3747      min2o=-max2o-1
3748    end if
3749 
3750    max3o=gboundout(4,2); min3o=gboundout(3,2)
3751 
3752    ! Compute arrays size and leading dimensions to avoid cache trashing
3753    m1o=max1o-min1o+1; md1o=2*(m1o/2)+1
3754    m2o=max2o-min2o+1; md2o=2*(m2o/2)+1
3755 
3756    if (nproc_fft/=1) then
3757      ! Increase max2o in order to have m2o divisible by nproc_fft
3758      min2o_moins=(((m2o-1)/nproc_fft+1)*nproc_fft-m2o)/2
3759      max2o_plus=((m2o-1)/nproc_fft+1)*nproc_fft-m2o-min2o_moins
3760      ! max2o=max2o+((m2o-1)/nproc_fft+1)*nproc_fft-m2o
3761      max2o=max2o+max2o_plus
3762      min2o=min2o-min2o_moins
3763      ! careful, to be checked and make sure the max and min are smaller than size of box
3764      m2o=max2o-min2o+1; md2o=2*(m2o/2)+1
3765    end if
3766    ABI_CHECK(m2o <= n2, "m2o > n2")
3767 
3768    m3o=max3o-min3o+1; md3o=2*(m3o/2)+1
3769  end if
3770 
3771  md1=max(md1i,md1o)
3772  md2=max(md2i,md2o)
3773  md3=max(md3i,md3o)
3774 
3775  md2proc=(max(m2i,m2o)-1)/nproc_fft+1
3776  n6eff=(n6-1)/nproc_fft+1
3777  !write(std_out,*)'fourwf_mpi : max1i,max2i,max3i=',max1i,max2i,max3i
3778  !write(std_out,*)'fourwf_mpi : min1i,min2i,min3i=',min1i,min2i,min3i
3779  !write(std_out,*)'fourwf_mpi : m1i,m2i,m3i=',m1i,m2i,m3i
3780 
3781  ! Allocate work array in G-space (note exchange 3 <--> 2)
3782  ABI_MALLOC(workf,(2,md1,md3,md2proc*ndat))
3783 
3784  if (option/=3) then
3785    ! Insert fofgin into the **small** box (array workf) :
3786    ! Note the switch of md3 and md2, as they are only needed to dimension workf inside "sphere"
3787 
3788    if (nproc_fft > 1) then
3789      if (istwf_k/=1 )then
3790        write(msg,'(a,i0,a)')'The value of istwf_k: ',istwf_k,' is not allowed. Only istwf_k=1 is allowed in MPI-FFT'
3791        !ABI_WARNING(msg)
3792        ABI_ERROR(msg)
3793      end if
3794      call sphere_fft1(fofgin,ndat,npwin,workf,m1i,m2i,m3i,md1,md3,md2proc,kg_kin,distribfft%tab_fftwf2_local)
3795    else
3796      iflag=2
3797      call sphere(fofgin,ndat,npwin,workf,m1i,m2i,m3i,md1,md3,md2proc,kg_kin,istwf_k,iflag,me_g0,shiftg0,symmE,one)
3798    end if
3799  end if
3800 
3801  ! Can we use c2r or r2c?
3802  cplexwf_=2; if (istwf_k==2) cplexwf_=1
3803  if (present(cplexwf)) cplexwf_ = cplexwf
3804 
3805  if (option==0 .or. ((option==1.or.option==2) .and. fftalgc==1) .or. option==3) then
3806    ! Treat non-composite operations
3807 
3808    if (option/=3) then
3809      ! Fourier transform workf(2,md1,md3,md2proc*ndat) to fofr (reciprocal to real space).
3810      ! FIXME: This is buggy if cplexwx==1
3811 
3812      select case (fftalga)
3813 
3814      case (FFT_SG2002)
3815 
3816 !        do idat=1,ndat
3817 !        call sg2002_mpiback_wf(cplexwf_,1,n1,n2,n3,n4,n5,(n6-1)/nproc_fft+1,&
3818 ! &        max1i,max2i,max3i,m1i,m2i,m3i,md1,md2proc,md3, &
3819 ! &         workf(:,:,:,(idat-1)*md2proc+1:idat*md2proc), &
3820 ! &        fofr(:,:,:,(idat-1)*n6eff+1:idat*n6eff),comm_fft)
3821 !        enddo
3822        call sg2002_mpiback_wf(cplexwf_,ndat,n1,n2,n3,n4,n5,(n6-1)/nproc_fft+1,&
3823          max1i,max2i,max3i,m1i,m2i,m3i,md1,md2proc,md3,workf,fofr,comm_fft)
3824 
3825      case (FFT_FFTW3)
3826 
3827        if (use_ialltoall) then
3828          call fftw3_mpiback_manywf(cplexwf_,ndat,n1,n2,n3,n4,n5,(n6-1)/nproc_fft+1,&
3829            max1i,max2i,max3i,m1i,m2i,m3i,md1,md2proc,md3,workf,fofr,comm_fft)
3830        else
3831          call fftw3_mpiback_wf(cplexwf_,ndat,n1,n2,n3,n4,n5,(n6-1)/nproc_fft+1,&
3832            max1i,max2i,max3i,m1i,m2i,m3i,md1,md2proc,md3,workf,fofr,comm_fft)
3833        end if
3834 
3835      !case (FFT_DFTI)
3836      !  call dfti_mpiback_wf(cplexwf_,ndat,n1,n2,n3,n4,n5,(n6-1)/nproc_fft+1,&
3837      !   &        max1i,max2i,max3i,m1i,m2i,m3i,md1,md2proc,md3,workf,fofr,comm_fft)
3838      case default
3839        ABI_ERROR("fftalga does not provide MPI back_wf")
3840      end select
3841 
3842    end if ! option
3843 
3844    nd3proc=(n3-1)/nproc_fft+1
3845 
3846    if (option==1) then
3847      ! Accumulate density
3848      do idat=1,ndat
3849        do i3=1,nd3proc
3850          i3_glob = i3 + nd3proc * me_fft
3851          i3dat = i3  + n6eff * (idat-1)
3852          do i2=1,n2
3853            do i1=1,n1
3854              denpot(i1,i2,i3_glob) = denpot(i1,i2,i3_glob) &
3855                + (weight_r*fofr(1,i1,i2,i3dat)**2+ weight_i*fofr(2,i1,i2,i3dat)**2)
3856            end do
3857          end do
3858        end do
3859      end do
3860    end if ! option==1
3861 
3862    if (option==2) then
3863 
3864      !  Apply local potential
3865      if (cplex==1) then
3866        do idat=1,ndat
3867          do i3=1,nd3proc
3868            i3_glob = i3 + nd3proc * me_fft
3869            i3dat = i3  + n6eff * (idat-1)
3870            do i2=1,n2
3871              do i1=1,n1
3872                fofr(1,i1,i2,i3dat)=denpot(i1,i2,i3_glob)*fofr(1,i1,i2,i3dat)
3873                fofr(2,i1,i2,i3dat)=denpot(i1,i2,i3_glob)*fofr(2,i1,i2,i3dat)
3874              end do
3875            end do
3876          end do
3877        end do
3878 
3879      else if (cplex==2) then
3880        do idat=1,ndat
3881          do i3=1,(n3-1)/nproc_fft+1
3882            i3_glob = i3 + nd3proc * me_fft
3883            i3dat = i3  + n6eff * (idat-1)
3884            do i2=1,n2
3885              do i1=1,n1
3886                fre=fofr(1,i1,i2,i3dat)
3887                fim=fofr(2,i1,i2,i3dat)
3888                fofr(1,i1,i2,i3dat)=denpot(2*i1-1,i2,i3_glob)*fre -denpot(2*i1,i2,i3_glob)*fim
3889                fofr(2,i1,i2,i3dat)=denpot(2*i1-1,i2,i3_glob)*fim +denpot(2*i1,i2,i3_glob)*fre
3890              end do
3891            end do
3892          end do
3893        end do
3894      end if ! cplex
3895 
3896    end if ! option==2
3897 
3898    if (option==2 .or. option==3) then
3899      ! Fourier transform fofr to workf (real to reciprocal space)
3900      ! output in workf(2,md1,md3,md2proc*ndat)
3901 
3902      select case (fftalga)
3903      case (FFT_SG2002)
3904 
3905        call sg2002_mpiforw_wf(cplexwf_,ndat,n1,n2,n3,n4,n5,(n6-1)/nproc_fft+1,&
3906         max1o,max2o,max3o,m1o,m2o,m3o,md1,md2proc,md3,fofr,workf,comm_fft)
3907 
3908      case (FFT_FFTW3)
3909 
3910        if (use_ialltoall) then
3911          call fftw3_mpiforw_manywf(cplexwf_,ndat,n1,n2,n3,n4,n5,(n6-1)/nproc_fft+1,&
3912           max1o,max2o,max3o,m1o,m2o,m3o,md1,md2proc,md3,fofr,workf,comm_fft)
3913        else
3914          call fftw3_mpiforw_wf(cplexwf_,ndat,n1,n2,n3,n4,n5,(n6-1)/nproc_fft+1,&
3915           max1o,max2o,max3o,m1o,m2o,m3o,md1,md2proc,md3,fofr,workf,comm_fft)
3916        end if
3917 
3918      !case (FFT_DFTI)
3919      !  call dfti_mpiforw_wf(cplexwf_,ndat,n1,n2,n3,n4,n5,(n6-1)/nproc_fft+1,&
3920      !  &max1o,max2o,max3o,m1o,m2o,m3o,md1,md2proc,md3,fofr,workf,comm_fft)
3921 
3922      case default
3923        ABI_ERROR("fftalga does not provide MPI back_wf")
3924      end select
3925 
3926    end if
3927 
3928  else if (fftalgc==2 .and. (option==1 .or. option==2)) then
3929    !  Treat composite operations
3930 
3931    select case (option)
3932    case (1)
3933      !ABI_CHECK(weight_r == weight_i,"weight_r != weight_i")
3934      weight_array_r(:)=weight_r
3935      weight_array_i(:)=weight_i
3936 
3937      select case (fftalga)
3938      case (FFT_SG2002)
3939        ! Note that here we don' fill fofr. Don't know if someone in
3940        ! abinit uses option 1 to get both fofr as well as denpot
3941        call sg2002_accrho(cplexwf_,ndat,n1,n2,n3,n4,n5,n6,(n6-1)/nproc_fft+1,&
3942          max1i,max2i,max3i,m1i,m2i,m3i,md1,md2proc,md3,comm_fft,nproc_fft,me_fft,&
3943          workf,denpot,weight_array_r,weight_array_i)
3944 
3945      case (FFT_FFTW3)
3946        call fftw3_accrho(cplexwf_,ndat,n1,n2,n3,n4,n5,n6,(n6-1)/nproc_fft+1,&
3947          max1i,max2i,max3i,m1i,m2i,m3i,md1,md2proc,md3,comm_fft,nproc_fft,me_fft,&
3948          workf,denpot,weight_array_r, weight_array_i)
3949 
3950      case default
3951        ABI_ERROR("fftalga does not provide accrho")
3952      end select
3953 
3954    case (2)
3955      !write(std_out,*)fftalg,option,cplex
3956      !ABI_CHECK(cplex==1,"cplex!=2 with fftalg 412 is buggy")
3957 
3958      select case (fftalga)
3959 
3960      case (FFT_SG2002)
3961 
3962        if (use_ialltoall) then
3963          call sg2002_applypot_many(cplexwf_,cplex,ndat,n1,n2,n3,n4,n5,n6,(n6-1)/nproc_fft+1,&
3964            max1i,max2i,max3i,m1i,m2i,m3i,md1,md2proc,md3,&
3965            max1o,max2o,max3o,m1o,m2o,m3o,comm_fft,nproc_fft,me_fft,denpot,workf)
3966 
3967        else
3968          call sg2002_applypot(cplexwf_,cplex,ndat,n1,n2,n3,n4,n5,n6,(n6-1)/nproc_fft+1,&
3969            max1i,max2i,max3i,m1i,m2i,m3i,md1,md2proc,md3,&
3970            max1o,max2o,max3o,m1o,m2o,m3o,comm_fft,nproc_fft,me_fft,denpot,workf)
3971        endif
3972 
3973      case (FFT_FFTW3)
3974 
3975        if (use_ialltoall) then
3976 
3977          call fftw3_applypot_many(cplexwf_,cplex,ndat,n1,n2,n3,n4,n5,n6,(n6-1)/nproc_fft+1,&
3978            max1i,max2i,max3i,m1i,m2i,m3i,md1,md2proc,md3,&
3979            max1o,max2o,max3o,m1o,m2o,m3o,comm_fft,nproc_fft,me_fft,denpot,workf)
3980 
3981        else
3982          call fftw3_applypot(cplexwf_,cplex,ndat,n1,n2,n3,n4,n5,n6,(n6-1)/nproc_fft+1,&
3983            max1i,max2i,max3i,m1i,m2i,m3i,md1,md2proc,md3,&
3984            max1o,max2o,max3o,m1o,m2o,m3o,comm_fft,nproc_fft,me_fft,denpot,workf)
3985        end if
3986 
3987      case default
3988        ABI_ERROR("fftalga does not provide applypot")
3989      end select
3990 
3991    case default
3992      write(msg,"(a,i0,a)")"Option ",option," is not supported when fftalgc == 2"
3993      ABI_ERROR(msg)
3994    end select
3995 
3996  end if ! End of composite operations
3997 
3998  if (option==2 .or. option==3) then
3999    ! From FFT box to kg_kout.
4000    xnorm=one/dble(n1*n2*n3)
4001 
4002    if (nproc_fft > 1) then
4003      do idat=1,ndat
4004        do ig=1,npwout
4005          i1=kg_kout(1,ig); if(i1<0) i1=i1+m1o; i1=i1+1
4006          i2=kg_kout(2,ig); if(i2<0) i2=i2+m2o; i2=i2+1
4007          i3=kg_kout(3,ig); if(i3<0) i3=i3+m3o; i3=i3+1
4008 
4009          igdat = ig + (idat-1) * npwout
4010          i2_loc = (i2-1)/nproc_fft +1
4011          i2dat_loc = i2_loc + (idat-1) * md2proc
4012 
4013          fofgout(1,igdat)=workf(1,i1,i3,i2dat_loc)*xnorm
4014          fofgout(2,igdat)=workf(2,i1,i3,i2dat_loc)*xnorm
4015        end do
4016      end do
4017    else
4018      ! Warning: This call is buggy if istwfk > 2
4019      iflag=-2
4020      call sphere(fofgout,ndat,npwout,workf,m1o,m2o,m3o,md1,md3,md2proc,kg_kout,istwf_k,iflag,&
4021        me_g0,shiftg0,symmE,xnorm)
4022    end if
4023  end if ! if option==2 or 3
4024 
4025  ABI_FREE(workf)
4026 
4027 !call timab(540,2,tsec)
4028 
4029 end subroutine fourwf_mpi

m_fft/indirect_parallel_Fourier [ Functions ]

[ Top ] [ m_fft ] [ Functions ]

NAME

 indirect_parallel_Fourier

FUNCTION

 The purpose of this routine is to transfer data from right to left right(:,index(i))=left(:,i)
 The difficulty is that right and left are distributed among processors
 We will suppose that the distribution is done as a density in Fourier space
 We first order the right hand side data according to the processor
 in which they are going to be located in the left hand side.
 This is done is a way such that  a mpi_alltoall put the data on the correct processor.
 We also transfer their future adress. A final ordering put everything in place

INPUTS

  index(sizeindex)= global adress for the transfer from right to left
  left(2,nleft)=left hand side
  mpi_enreg=information about MPI parallelization
  ngleft(18)=contain all needed information about 3D FFT for the left hand side
  see ~abinit/doc/variables/vargs.htm#ngfft
  ngright(18)=contain all needed information about 3D FFT for the right hand side
  see ~abinit/doc/variables/vargs.htm#ngfft
  nleft=second dimension of left array (for this processor)
  nright=second dimension of right array (for this processor)
  sizeindex=size of the index array (different form nright, because it is global to all proccessors)

OUTPUT

  left(2,nleft)=the elements of the right hand side, at the correct palce in the correct processor

NOTES

  A lot of things to improve.

SOURCE

4557 subroutine indirect_parallel_Fourier(index,left,mpi_enreg,ngleft,ngright,nleft,nright,paral_kgb,right,sizeindex)
4558 
4559 !Arguments ---------------------------------------------
4560 !scalars
4561  integer,intent(in) :: ngleft(18),ngright(18),nleft,nright,paral_kgb,sizeindex
4562  type(MPI_type),intent(in) :: mpi_enreg
4563 !arrays
4564  integer,intent(in) :: index(sizeindex)
4565  real(dp),intent(in) :: right(2,nright)
4566  real(dp),intent(inout) :: left(2,nleft)
4567 
4568 !Local variables ---------------------------------------
4569 !scalars
4570  integer :: ierr,i_global,ileft,iright,iright_global
4571  integer :: j,j1,j2,j3,j_global,jleft_global
4572  integer :: jleft_local,me_fft,n1l,n2l,n3l,n1r,n2r,n3r,nd2l,nd2r
4573  integer :: nproc_fft,proc_dest,r2,siz_slice_max
4574 !arrays
4575  integer,allocatable :: index_recv(:),index_send(:),siz_slice(:), ffti2r_global(:)
4576  integer, ABI_CONTIGUOUS pointer :: fftn2l_distrib(:),ffti2l_local(:)
4577  integer, ABI_CONTIGUOUS pointer :: fftn3l_distrib(:),ffti3l_local(:)
4578  integer, ABI_CONTIGUOUS pointer :: fftn2r_distrib(:),ffti2r_local(:)
4579  integer, ABI_CONTIGUOUS pointer :: fftn3r_distrib(:),ffti3r_local(:)
4580  real(dp),allocatable :: right_send(:,:),right_recv(:,:)
4581 
4582 ! *************************************************************************
4583  n1r=ngright(1);n2r=ngright(2);n3r=ngright(3)
4584  n1l=ngleft(1) ;n2l=ngleft(2) ;n3l=ngleft(3)
4585  nproc_fft=mpi_enreg%nproc_fft; me_fft=mpi_enreg%me_fft
4586  nd2r=n2r/nproc_fft; nd2l=n2l/nproc_fft
4587 
4588  !Get the distrib associated with the left fft_grid
4589  call ptabs_fourdp(mpi_enreg,n2l,n3l,fftn2l_distrib,ffti2l_local,fftn3l_distrib,ffti3l_local)
4590 
4591  !Get the distrib associated with the right fft_grid
4592  call ptabs_fourdp(mpi_enreg,n2r,n3r,fftn2r_distrib,ffti2r_local,fftn3r_distrib,ffti3r_local)
4593 
4594  !Precompute local --> global corespondance
4595  ABI_MALLOC(ffti2r_global,(nd2r))
4596  ffti2r_global(:) = -1
4597  do j2=1,n2r
4598     if( fftn2r_distrib(j2) == me_fft ) then
4599        ffti2r_global( ffti2r_local(j2) ) = j2
4600     end if
4601  end do
4602 
4603 
4604  ABI_MALLOC(siz_slice,(nproc_fft))
4605  siz_slice(:)=0
4606  do i_global=1,sizeindex !look for the maximal size of slice of data
4607   j_global=index(i_global)!; write(std_out,*) j_global,i_global
4608   if(j_global /=0) then
4609     !use the fact that (j-1)=i1 + n1l*(j2l-1 + n2l*(j3l-1))
4610    proc_dest= fftn2l_distrib( modulo((j_global-1)/n1l,n2l) + 1)
4611    siz_slice(proc_dest+1)=siz_slice(proc_dest+1)+1
4612 !write(std_out,*) 'in indirect proc',proc_dest,siz_slice(proc_dest+1)
4613   end if
4614  end do
4615  siz_slice_max=maxval(siz_slice) !This value could be made smaller by looking locally
4616 !and performing a allgather with a max
4617 !write(std_out,*) 'siz_slice,sizeindex,siz_slice',siz_slice(:),sizeindex,siz_slice_max
4618 !write(std_out,*) 'sizeindex,nright,nleft',sizeindex,nright,nleft
4619  ABI_MALLOC(right_send,(2,nproc_fft*siz_slice_max))
4620  ABI_MALLOC(index_send,(nproc_fft*siz_slice_max))
4621  siz_slice(:)=0; index_send(:)=0; right_send(:,:)=zero
4622  do iright=1,nright
4623   j=iright-1;j1=modulo(j,n1r);j2=modulo(j/n1r,nd2r);j3=j/(n1r*nd2r)
4624   j2 = ffti2r_global(j2+1) - 1
4625   iright_global=n1r*(n2r*j3+j2)+j1+1
4626   jleft_global=index(iright_global)
4627   if(jleft_global/=0)then
4628      j=jleft_global-1;j1=modulo(j,n1l);j2=modulo(j/n1l,n2l);j3=j/(n1l*n2l); r2=ffti2l_local(j2+1)-1
4629    jleft_local=n1l*(nd2l*j3+r2)+j1+1
4630    proc_dest=fftn2l_distrib(j2+1)
4631    siz_slice(proc_dest+1)=siz_slice(proc_dest+1)+1
4632    right_send(:,proc_dest*siz_slice_max+siz_slice(proc_dest+1))=right(:,iright)
4633    index_send(proc_dest*siz_slice_max+siz_slice(proc_dest+1))=jleft_local
4634 !write(std_out,*) 'loop ir',jleft_local,jleft_global,iright_global,iright
4635   end if
4636  end do
4637  ABI_MALLOC(right_recv,(2,nproc_fft*siz_slice_max))
4638  ABI_MALLOC(index_recv,(nproc_fft*siz_slice_max))
4639 #if defined HAVE_MPI
4640   if(paral_kgb == 1) then
4641     call mpi_alltoall (right_send,2*siz_slice_max, &
4642 &                          MPI_double_precision, &
4643 &                          right_recv,2*siz_slice_max, &
4644 &                          MPI_double_precision,mpi_enreg%comm_fft,ierr)
4645     call mpi_alltoall (index_send,siz_slice_max, &
4646 &                          MPI_integer, &
4647 &                          index_recv,siz_slice_max, &
4648 &                          MPI_integer,mpi_enreg%comm_fft,ierr)
4649   endif
4650 #endif
4651  do ileft=1,siz_slice_max*nproc_fft
4652 !write(std_out,*)index_recv(ileft)
4653  if(index_recv(ileft) /=0 ) left(:,index_recv(ileft))=right_recv(:,ileft)
4654  end do
4655  ABI_FREE(right_recv)
4656  ABI_FREE(index_recv)
4657  ABI_FREE(right_send)
4658  ABI_FREE(index_send)
4659  ABI_FREE(siz_slice)
4660  ABI_FREE(ffti2r_global)
4661 
4662 end subroutine indirect_parallel_Fourier

m_fft/uplan_execute_gr_dpc [ Functions ]

[ Top ] [ m_fft ] [ Functions ]

NAME

  uplan_execute_gr_dpc

FUNCTION

INPUTS

SOURCE

4884 subroutine uplan_execute_gr_dpc(uplan, ndat, ug, ur, isign, iscale)
4885 
4886 !Arguments ------------------------------------
4887  class(uplan_t),intent(in) :: uplan
4888  integer,intent(in) :: ndat
4889  complex(dp),intent(in) :: ug(uplan%npw*uplan%nspinor*ndat)
4890  complex(dp),intent(out) :: ur(uplan%nfft*uplan%nspinor*ndat)
4891  integer,optional,intent(in) :: isign, iscale
4892 
4893 !Local variables-------------------------------
4894  integer :: isign__, iscale__, nx, ny, nz, ldx, ldy, ldz, fftalg, fftalga, fftalgc, fftcache
4895 
4896 ! *************************************************************************
4897 
4898  ABI_CHECK_ILEQ(ndat, uplan%batch_size, "ndat > batch_size!")
4899  ABI_CHECK_IEQ(dp, uplan%kind, "Incosistent kind!")
4900 
4901  isign__ = +1; if (present(isign)) isign__ = isign
4902  iscale__ = 0; if (present(iscale)) iscale__ = iscale
4903 
4904  fftalg = uplan%ngfft(7); fftcache = uplan%ngfft(8); fftalga = fftalg/100; fftalgc = mod(fftalg, 10)
4905  nx = uplan%ngfft(1); ny = uplan%ngfft(2); nz = uplan%ngfft(3)
4906  ldx = nx; ldy = ny; ldz = nz ! No augmentation, the caller does not support it.
4907 
4908  if (uplan%gpu_option == ABI_GPU_DISABLED) then
4909    select case (fftalga)
4910    case (FFT_FFTW3)
4911      call fftw3_fftug(fftalg, fftcache, uplan%npw, nx, ny, nz, ldx, ldy, ldz, uplan%nspinor*ndat, &
4912                       uplan%istwfk, uplan%mgfft, uplan%kg_k, uplan%gbound, ug, ur, &
4913                       isign=isign__, iscale=iscale__)
4914    case (FFT_DFTI)
4915      call dfti_fftug(fftalg, fftcache, uplan%npw, nx, ny, nz, ldx, ldy, ldz, uplan%nspinor*ndat, &
4916                      uplan%istwfk, uplan%mgfft, uplan%kg_k, uplan%gbound, ug, ur, &
4917                      isign=isign__, iscale=iscale__)
4918    case default
4919      ABI_ERROR(sjoin("Wrong fftalga:", itoa(fftalga)))
4920    end select
4921 
4922  else
4923    NOT_IMPLEMENTED_ERROR()
4924  end if
4925 
4926 end subroutine uplan_execute_gr_dpc

m_fft/uplan_execute_gr_spc [ Functions ]

[ Top ] [ m_fft ] [ Functions ]

NAME

  uplan_execute_gr_spc

FUNCTION

INPUTS

SOURCE

4827 subroutine uplan_execute_gr_spc(uplan, ndat, ug, ur, isign, iscale)
4828 
4829 !Arguments ------------------------------------
4830  class(uplan_t),intent(in) :: uplan
4831  integer,intent(in) :: ndat
4832  complex(sp),intent(in) :: ug(uplan%npw*uplan%nspinor*ndat)
4833  complex(sp),intent(out) :: ur(uplan%nfft*uplan%nspinor*ndat)
4834  integer,optional,intent(in) :: isign, iscale
4835 
4836 !Local variables-------------------------------
4837  integer :: isign__, iscale__, nx, ny, nz, ldx, ldy, ldz, fftalg, fftalga, fftalgc, fftcache
4838 
4839 ! *************************************************************************
4840 
4841  ABI_CHECK_ILEQ(ndat, uplan%batch_size, "ndat > batch_size!")
4842  ABI_CHECK_IEQ(sp, uplan%kind, "Incosistent kind!")
4843 
4844  isign__ = +1; if (present(isign)) isign__ = isign
4845  iscale__ = 0; if (present(iscale)) iscale__ = iscale
4846 
4847  fftalg = uplan%ngfft(7); fftcache = uplan%ngfft(8); fftalga = fftalg/100; fftalgc = mod(fftalg, 10)
4848  nx = uplan%ngfft(1); ny = uplan%ngfft(2); nz = uplan%ngfft(3)
4849  ldx = nx; ldy = ny; ldz = nz ! No augmentation, the caller does not support it.
4850 
4851  if (uplan%gpu_option == ABI_GPU_DISABLED) then
4852    select case (fftalga)
4853    case (FFT_FFTW3)
4854      call fftw3_fftug(fftalg, fftcache, uplan%npw, nx, ny, nz, ldx, ldy, ldz, uplan%nspinor*ndat, &
4855                       uplan%istwfk, uplan%mgfft, uplan%kg_k, uplan%gbound, ug, ur, &
4856                       isign=isign__, iscale=iscale__)
4857    case (FFT_DFTI)
4858      call dfti_fftug(fftalg, fftcache, uplan%npw, nx, ny, nz, ldx, ldy, ldz, uplan%nspinor*ndat, &
4859                      uplan%istwfk, uplan%mgfft, uplan%kg_k, uplan%gbound, ug, ur, &
4860                      isign=isign__, iscale=iscale__)
4861    case default
4862      ABI_ERROR(sjoin("Wrong fftalga:", itoa(fftalga)))
4863    end select
4864 
4865  else
4866    NOT_IMPLEMENTED_ERROR()
4867  end if
4868 
4869 end subroutine uplan_execute_gr_spc

m_fft/uplan_execute_rg_dpc [ Functions ]

[ Top ] [ m_fft ] [ Functions ]

NAME

  uplan_execute_rg_dpc

FUNCTION

INPUTS

SOURCE

4996 subroutine uplan_execute_rg_dpc(uplan, ndat, ur, ug, isign, iscale)
4997 
4998 !Arguments ------------------------------------
4999  class(uplan_t),intent(in) :: uplan
5000  integer,intent(in) :: ndat
5001  complex(dp),intent(inout) :: ur(uplan%nfft*uplan%nspinor*ndat)
5002  complex(dp),intent(out) :: ug(uplan%npw*uplan%nspinor*ndat)
5003  integer,optional,intent(in) :: isign, iscale
5004 
5005 !Local variables-------------------------------
5006  integer :: isign__, iscale__, nx, ny, nz, ldx, ldy, ldz, fftalg, fftalga, fftalgc, fftcache
5007 ! *************************************************************************
5008 
5009  ABI_CHECK_ILEQ(ndat, uplan%batch_size, "ndat > batch_size!")
5010  ABI_CHECK_IEQ(dp, uplan%kind, "Incosistent kind!")
5011 
5012  isign__ = -1; if (present(isign)) isign__ = isign
5013  iscale__ = 1; if (present(iscale)) iscale__ = iscale
5014 
5015  fftalg = uplan%ngfft(7); fftcache = uplan%ngfft(8); fftalga = fftalg/100; fftalgc = mod(fftalg, 10)
5016  nx = uplan%ngfft(1); ny = uplan%ngfft(2); nz = uplan%ngfft(3)
5017  ldx = nx; ldy = ny; ldz = nz ! No augmentation, the caller does not support it.
5018 
5019  if (uplan%gpu_option == ABI_GPU_DISABLED) then
5020    select case (fftalga)
5021    case (FFT_FFTW3)
5022      call fftw3_fftur(fftalg, fftcache, uplan%npw, nx, ny, nz, ldx, ldy, ldz, uplan%nspinor*ndat, uplan%istwfk, uplan%mgfft, &
5023                       uplan%kg_k, uplan%gbound, ur, ug, isign=isign__, iscale=iscale__)
5024    case (FFT_DFTI)
5025      call dfti_fftur(fftalg, fftcache, uplan%npw, nx, ny, nz, ldx, ldy, ldz, uplan%nspinor*ndat, uplan%istwfk, uplan%mgfft, &
5026                      uplan%kg_k, uplan%gbound, ur, ug, isign=isign__, iscale=iscale__)
5027    case default
5028      ABI_ERROR(sjoin("Wrong fftalga:", itoa(fftalga)))
5029    end select
5030 
5031  else
5032    NOT_IMPLEMENTED_ERROR()
5033  end if
5034 
5035 end subroutine uplan_execute_rg_dpc

m_fft/uplan_execute_rg_spc [ Functions ]

[ Top ] [ m_fft ] [ Functions ]

NAME

  uplan_execute_rg_spc

FUNCTION

INPUTS

SOURCE

4941 subroutine uplan_execute_rg_spc(uplan, ndat, ur, ug, isign, iscale)
4942 
4943 !Arguments ------------------------------------
4944  class(uplan_t),intent(in) :: uplan
4945  integer,intent(in) :: ndat
4946  complex(sp),intent(inout) :: ur(uplan%nfft*uplan%nspinor*ndat)
4947  complex(sp),intent(out) :: ug(uplan%npw*uplan%nspinor*ndat)
4948  integer,optional,intent(in) :: isign, iscale
4949 
4950 !Local variables-------------------------------
4951  integer :: isign__, iscale__, nx, ny, nz, ldx, ldy, ldz, fftalg, fftalga, fftalgc, fftcache
4952 
4953 ! *************************************************************************
4954 
4955  ABI_CHECK_ILEQ(ndat, uplan%batch_size, "ndat > batch_size!")
4956  ABI_CHECK_IEQ(sp, uplan%kind, "Incosistent kind!")
4957 
4958  isign__ = -1; if (present(isign)) isign__ = isign
4959  iscale__ = 1; if (present(iscale)) iscale__ = iscale
4960 
4961  fftalg = uplan%ngfft(7); fftcache = uplan%ngfft(8); fftalga = fftalg/100; fftalgc = mod(fftalg, 10)
4962  nx = uplan%ngfft(1); ny = uplan%ngfft(2); nz = uplan%ngfft(3)
4963  ldx = nx; ldy = ny; ldz = nz ! No augmentation, the caller does not support it.
4964 
4965  if (uplan%gpu_option == ABI_GPU_DISABLED) then
4966    select case (fftalga)
4967    case (FFT_FFTW3)
4968      call fftw3_fftur(fftalg, fftcache, uplan%npw, nx, ny, nz, ldx, ldy, ldz, uplan%nspinor*ndat, uplan%istwfk, uplan%mgfft, &
4969                       uplan%kg_k, uplan%gbound, ur, ug, isign=isign__, iscale=iscale__)
4970    case (FFT_DFTI)
4971      call dfti_fftur(fftalg, fftcache, uplan%npw, nx, ny, nz, ldx, ldy, ldz, uplan%nspinor*ndat, uplan%istwfk, uplan%mgfft, &
4972                      uplan%kg_k, uplan%gbound, ur, ug, isign=isign__, iscale=iscale__)
4973    case default
4974      ABI_ERROR(sjoin("Wrong fftalga:", itoa(fftalga)))
4975    end select
4976 
4977  else
4978    NOT_IMPLEMENTED_ERROR()
4979  end if
4980 
4981 end subroutine uplan_execute_rg_spc

m_fft/uplan_free [ Functions ]

[ Top ] [ m_fft ] [ Functions ]

NAME

  uplan_free

FUNCTION

INPUTS

SOURCE

4801 subroutine uplan_free(uplan)
4802 
4803 !Arguments ------------------------------------
4804  class(uplan_t),intent(inout) :: uplan
4805 ! *************************************************************************
4806 
4807  ABI_SFREE(uplan%gbound)
4808  if (uplan%gpu_option /= ABI_GPU_DISABLED) then
4809    ! Free memory on the GPU
4810  end if
4811 
4812 end subroutine uplan_free

m_fft/uplan_init [ Functions ]

[ Top ] [ m_fft ] [ Functions ]

NAME

  uplan_init

FUNCTION

INPUTS

SOURCE

4755 subroutine uplan_init(uplan, npw, nspinor, batch_size, ngfft, istwfk, kg_k, kind, gpu_option)
4756 
4757 !Arguments ------------------------------------
4758 !scalars
4759  class(uplan_t),intent(out) :: uplan
4760  integer,intent(in) :: npw, nspinor, batch_size, istwfk, kind, gpu_option
4761 !arrays
4762  integer,intent(in) :: ngfft(18)
4763  integer,target,intent(in) :: kg_k(3,npw)
4764 
4765 ! *************************************************************************
4766 
4767  uplan%npw = npw
4768  uplan%nspinor = nspinor
4769  uplan%istwfk = istwfk
4770  uplan%batch_size = batch_size
4771  uplan%kind  = kind
4772  uplan%gpu_option  = gpu_option
4773  uplan%ngfft = ngfft
4774  uplan%mgfft = maxval(ngfft(1:3))
4775  uplan%nfft  = product(ngfft(1:3))
4776  uplan%kg_k => kg_k
4777 
4778  ABI_MALLOC(uplan%gbound, (2 * uplan%mgfft + 8, 2))
4779  call sphereboundary(uplan%gbound, uplan%istwfk, uplan%kg_k, uplan%mgfft, uplan%npw)
4780 
4781  if (uplan%gpu_option /= ABI_GPU_DISABLED) then
4782    ! Allocate memory on the device and transfer data.
4783    NOT_IMPLEMENTED_ERROR()
4784  end if
4785 
4786 end subroutine uplan_init

m_fft/uplan_t [ Types ]

[ Top ] [ m_fft ] [ Types ]

NAME

 uplan_t

FUNCTION

SOURCE

207  type, public :: uplan_t
208 
209    integer :: npw = -1
210    integer :: nspinor = -1
211    integer :: batch_size = -1  ! MAXIMUM number of FFTs associated to the plan.
212    integer :: istwfk = -1
213    integer :: kind = -1
214    integer :: gpu_option = ABI_GPU_DISABLED  ! /= 0 if FFTs should be offloaded to the GPU.
215    integer :: nfft = -1  ! Total number of points in the FFT box.
216    integer :: mgfft = -1
217    integer :: ngfft(18)
218    integer, contiguous, pointer :: kg_k(:,:)
219    integer, allocatable :: gbound(:,:)
220 
221  contains
222    procedure :: init => uplan_init    ! Build object
223    procedure :: free => uplan_free    ! Free dynamic memory
224 
225    procedure :: execute_gr_spc => uplan_execute_gr_spc
226    procedure :: execute_gr_dpc => uplan_execute_gr_dpc
227    procedure :: execute_rg_spc => uplan_execute_rg_spc
228    procedure :: execute_rg_dpc => uplan_execute_rg_dpc
229 
230    ! Main entry point for performing FFTs on the full box
231    ! complex-to-complex version, operating on complex arrays
232    generic :: execute_gr => execute_gr_spc, &
233                             execute_gr_dpc
234 
235    generic :: execute_rg => execute_rg_spc, &
236                             execute_rg_dpc
237  end type uplan_t

m_fft/zerosym [ Functions ]

[ Top ] [ m_fft ] [ Functions ]

NAME

 zerosym

FUNCTION

 Symmetrize an array on the FFT grid by vanishing some term on the boundaries.

INPUTS

  cplex= if 1, input array is REAL, if 2, input array is COMPLEX
  mpi_enreg=information about MPI parallelization
  n1,n2,n3=FFT dimensions nfft=n1*n2*n3
  ig1,ig2,ig3=optional arguments= indexes of unbalanced g-vectors to cancel
              if not present, ig1=1+n1/2, ig2=1+n2/2, ig3=1+n3/2 for even n1,n2,n3
              if igj=-1, nothing is done in direction j

OUTPUT

  (see side effects)

SIDE EFFECTS

  array(cplex,n1*n2*n3)=complex array to be symetrized

SOURCE

4120 subroutine zerosym(array,cplex,n1,n2,n3, &
4121                    ig1,ig2,ig3,comm_fft,distribfft) ! Optional arguments
4122 
4123 
4124 !Arguments ------------------------------------
4125 !scalars
4126  integer,intent(in) :: cplex,n1,n2,n3
4127  integer,optional,intent(in) :: ig1,ig2,ig3,comm_fft
4128  type(distribfft_type),intent(in),target,optional :: distribfft
4129 !arrays
4130  real(dp),intent(inout) :: array(cplex,n1*n2*n3)
4131 
4132 !Local variables-------------------------------
4133 !scalars
4134  integer :: i1,i2,i3,ifft,ifft_proc,index,j,j1,j2,j3,me_fft,nd2
4135  integer :: nproc_fft,n1sel,nn12,n2sel,n3sel,r2
4136  !arrays
4137  integer, ABI_CONTIGUOUS pointer :: fftn2_distrib(:),ffti2_local(:)
4138 
4139 ! **********************************************************************
4140 
4141  DBG_ENTER("COLL")
4142 
4143  me_fft=0;nproc_fft=1
4144  if (present(comm_fft)) then
4145    me_fft=xmpi_comm_rank(comm_fft)
4146    nproc_fft=xmpi_comm_size(comm_fft)
4147  end if
4148  nd2=(n2-1)/nproc_fft+1
4149  nn12=n1*n2
4150 
4151 !Get the distrib associated with this fft_grid
4152  if (present(distribfft)) then
4153    if (n2== distribfft%n2_coarse) then
4154      fftn2_distrib => distribfft%tab_fftdp2_distrib
4155      ffti2_local => distribfft%tab_fftdp2_local
4156    else if(n2 == distribfft%n2_fine) then
4157      fftn2_distrib => distribfft%tab_fftdp2dg_distrib
4158      ffti2_local => distribfft%tab_fftdp2dg_local
4159    else
4160      ABI_BUG("Unable to find an allocated distrib for this fft grid")
4161    end if
4162  else
4163    ABI_MALLOC(fftn2_distrib,(n2))
4164    ABI_MALLOC(ffti2_local,(n2))
4165    fftn2_distrib=0;ffti2_local=(/(i2,i2=1,n2)/)
4166  end if
4167 
4168  if (present(ig1)) then
4169    n1sel=ig1
4170  else if (mod(n1,2)==0) then
4171    n1sel=1+n1/2
4172  else
4173    n1sel=-1
4174  end if
4175  if (present(ig2)) then
4176    n2sel=ig2
4177  else if (mod(n2,2)==0) then
4178    n2sel=1+n2/2
4179  else
4180    n2sel=-1
4181  end if
4182  if (present(ig3)) then
4183    n3sel=ig3
4184  else if (mod(n3,2)==0) then
4185    n3sel=1+n3/2
4186  else
4187    n3sel=-1
4188  end if
4189 
4190  if (n1sel>0) then
4191    index=n1sel-nn12-n1
4192    do i3=1,n3
4193      index=index+nn12;ifft=index
4194      do i2=1,n2
4195        ifft=ifft+n1
4196        if (nproc_fft>1) then
4197          ! MPIWF: consider ifft only if it is treated by the current proc and compute its adress
4198          j=ifft-1;j1=modulo(j,n1);j2=modulo(j/n1,n2);j3=j/(n1*n2) !;r2=modulo(j2,nd2)
4199          if(fftn2_distrib(j2+1)==me_fft) then ! MPIWF this ifft is to be treated by me_fft
4200            r2= ffti2_local(j2+1) - 1
4201            ifft_proc=n1*(nd2*j3+r2)+j1+1 !this is ifft in the current proc
4202            array(:,ifft_proc)=zero
4203          end if
4204        else
4205          array(:,ifft)=zero
4206        end if
4207      end do
4208    end do
4209  end if
4210 
4211  if (n2sel>0) then
4212    index=n1*n2sel-nn12-n1
4213    do i3=1,n3
4214      index=index+nn12;ifft=index
4215      do i1=1,n1
4216        ifft=ifft+1
4217        if (nproc_fft>1) then
4218          ! MPIWF: consider ifft only if it is treated by the current proc and compute its adress
4219          j=ifft-1;j1=modulo(j,n1);j2=modulo(j/n1,n2);j3=j/(n1*n2);
4220          if(fftn2_distrib(j2+1)==me_fft) then ! MPIWF this ifft is to be treated by me_fft
4221            r2= ffti2_local(j2+1) - 1
4222            ifft_proc=n1*(nd2*j3+r2)+j1+1 !this is ifft in the current proc
4223            array(:,ifft_proc)=zero
4224          end if
4225        else
4226          array(:,ifft)=zero
4227        end if
4228      end do
4229    end do
4230  end if
4231 
4232  if (n3sel>0) then
4233    index=nn12*n3sel-nn12-n1
4234    do i2=1,n2
4235      index=index+n1;ifft=index
4236      do i1=1,n1
4237        ifft=ifft+1
4238        if (nproc_fft>1) then
4239          ! MPIWF: consider ifft only if it is treated by the current proc and compute its adress
4240          j=ifft-1;j1=modulo(j,n1);j2=modulo(j/n1,n2);j3=j/(n1*n2)
4241          if(fftn2_distrib(j2+1)==me_fft) then ! MPIWF this ifft is to be treated by me_fft
4242            r2= ffti2_local(j2+1) - 1
4243            ifft_proc=n1*(nd2*j3+r2)+j1+1 !this is ifft in the current proc
4244            array(:,ifft_proc)=zero
4245          end if
4246        else
4247          array(:,ifft)=zero
4248        end if
4249      end do
4250    end do
4251  end if
4252 
4253  if (.not.present(distribfft)) then
4254    ABI_FREE(fftn2_distrib)
4255    ABI_FREE(ffti2_local)
4256  end if
4257 
4258  DBG_EXIT("COLL")
4259 
4260 end subroutine zerosym