TABLE OF CONTENTS


ABINIT/m_sg2002 [ Modules ]

[ Top ] [ Modules ]

NAME

  m_sg2002

FUNCTION

COPYRIGHT

  Copyright (C) 2002-2007 Stefan Goedecker, CEA Grenoble
  Copyright (C) 2014-2018 ABINIT group (XG)
  This file is distributed under the terms of the
  GNU General Public License, see ~abinit/COPYING
  or http://www.gnu.org/copyleft/gpl.txt .

NOTES

PARENTS

CHILDREN

SOURCE

23 #if defined HAVE_CONFIG_H
24 #include "config.h"
25 #endif
26 
27 #include "abi_common.h"
28 
29 module m_sg2002
30 
31  use defs_basis
32  use defs_fftdata
33  use m_abicore
34  use m_errors
35  use m_xmpi
36 
37  use m_time,         only : timab
38  use m_fstrings,     only : itoa
39  use m_fftcore,      only : sphere_fft1, fill, scramble, switchreal, switch, mpiswitch,&
40 &                           unfill, unscramble, unswitchreal, unswitch, unmpiswitch,&
41 &                           fill_cent, switch_cent, switchreal_cent, mpiswitch_cent, multpot, addrho,&
42 &                           unfill_cent, unswitchreal_cent, unswitch_cent, unmpiswitch_cent, unscramble,&
43 &                           mpifft_fg2dbox, mpifft_dbox2fr, mpifft_fr2dbox, mpifft_dbox2fg
44 
45  implicit none
46 
47  private
48 
49  ! Public API:
50  !public :: sg2002_seqfourdp   ! seq-FFT of densities and potentials.
51  public :: sg2002_mpifourdp    ! MPI-FFT of densities and potentials.
52  !public :: mpi_fourwf
53  !public :: sg2002_seqfourwf   ! seq-FFT of wavefunctions.
54  !public :: sg2002_mpifourwf   ! MPI-FFT of wavefunctions.
55 
56 ! Low-level tools.
57 ! These procedure shouls be accessed via a wrapper that selected the library via fftalg
58  public :: sg2002_back           ! G --> R for densities and potentials
59  public :: sg2002_forw           ! R --> G for densities and potentials
60  public :: sg2002_mpiback_wf     ! G --> R for wavefunctions
61  public :: sg2002_mpiforw_wf     ! R --> G for wavefunctions
62  public :: sg2002_applypot       ! Compute <G|vloc|u> where u is given in reciprocal space.
63  public :: sg2002_applypot_many  ! Compute <G|vloc|u> where u is given in reciprocal space.
64  public :: sg2002_accrho         ! Compute rho = weigth_r*Re(u(r))**2 + weigth_i*Im(u(r))**2
65 
66 contains

m_sg2002/ctrig [ Functions ]

[ Top ] [ m_sg2002 ] [ Functions ]

NAME

  ctrig

FUNCTION

INPUTS

OUTPUT

PARENTS

      m_sg2002

CHILDREN

SOURCE

2619 subroutine ctrig(n,trig,after,before,now,isign,ic)
2620 
2621 
2622 !This section has been created automatically by the script Abilint (TD).
2623 !Do not modify the following lines by hand.
2624 #undef ABI_FUNC
2625 #define ABI_FUNC 'ctrig'
2626 !End of the abilint section
2627 
2628  implicit none
2629 
2630 !Arguments ------------------------------------
2631  integer,intent(in) :: n,isign
2632  integer,intent(inout) :: ic
2633  integer,intent(inout) :: after(mdata),before(mdata),now(mdata)
2634  real(dp),intent(inout) :: trig(2,n)
2635 
2636 !Local variables-------------------------------
2637 !scalars
2638  integer :: i,itt,j,nh
2639  real(dp) :: angle,trigc,trigs
2640 
2641 ! *************************************************************************
2642 
2643  do i=1,ndata
2644    if (n.eq.ifftdata(1,i)) then
2645      ic=0
2646      do j=1,(mdata-1)
2647        itt=ifftdata(1+j,i)
2648        if (itt.gt.1) then
2649          ic=ic+1
2650          now(j)=ifftdata(1+j,i)
2651        else
2652          goto 1000
2653        end if
2654      end do
2655      goto 1000
2656    end if
2657  end do
2658 
2659  write(std_out,*) 'VALUE OF',n,'NOT ALLOWED FOR FFT, ALLOWED VALUES ARE:'
2660 37 format(15(i5))
2661  write(std_out,37) (ifftdata(1,i),i=1,ndata)
2662  MSG_ERROR("Aborting now")
2663 
2664 1000 continue
2665  after(1)=1
2666  before(ic)=1
2667  do i=2,ic
2668    after(i)=after(i-1)*now(i-1)
2669    before(ic-i+1)=before(ic-i+2)*now(ic-i+2)
2670  end do
2671 
2672  angle=isign*two_pi/n
2673  if (mod(n,2).eq.0) then
2674    nh=n/2
2675    trig(1,1)=one
2676    trig(2,1)=zero
2677    trig(1,nh+1)=-one
2678    trig(2,nh+1)=zero
2679    do i=1,nh-1
2680      trigc=cos(i*angle)
2681      trigs=sin(i*angle)
2682      trig(1,i+1)=trigc
2683      trig(2,i+1)=trigs
2684      trig(1,n-i+1)=trigc
2685      trig(2,n-i+1)=-trigs
2686    end do
2687  else
2688    nh=(n-1)/2
2689    trig(1,1)=one
2690    trig(2,1)=zero
2691    do i=1,nh
2692      trigc=cos(i*angle)
2693      trigs=sin(i*angle)
2694      trig(1,i+1)=trigc
2695      trig(2,i+1)=trigs
2696      trig(1,n-i+1)=trigc
2697      trig(2,n-i+1)=-trigs
2698    end do
2699  end if
2700 
2701 end subroutine ctrig

m_sg2002/fftstp [ Functions ]

[ Top ] [ m_sg2002 ] [ Functions ]

NAME

  fftstp

FUNCTION

INPUTS

   mm
   n1dfft
   m
   nn
   n
   zin
   trig
   after
   now
   before
   isign

OUTPUT

   zout

PARENTS

      m_sg2002

CHILDREN

SOURCE

2733 subroutine fftstp(mm,n1dfft,m,nn,n,zin,zout,trig,after,now,before,isign)
2734 
2735 
2736 !This section has been created automatically by the script Abilint (TD).
2737 !Do not modify the following lines by hand.
2738 #undef ABI_FUNC
2739 #define ABI_FUNC 'fftstp'
2740 !End of the abilint section
2741 
2742  implicit none
2743 
2744 !Arguments ------------------------------------
2745  integer,intent(in) :: after,before,mm,n1dfft,m,nn,n,now,isign
2746  real(dp),intent(in) :: trig(2,n),zin(2,mm,m)
2747  real(dp),intent(inout) :: zout(2,nn,n)
2748 
2749 !Local variables-------------------------------
2750  integer :: atn,atb,ia,ias,ib,itrig,itt,j,nin1,nin2,nin3,nin4,nin5,nin6,nin7,nin8
2751  integer :: nout1,nout2,nout3,nout4,nout5,nout6,nout7,nout8
2752  real(dp) :: am,ap,bm,bp,ci3,ci4,ci5,ci6,ci7,ci8,cm,cos2,cos4,cp,cr2,cr3,cr4,cr5,cr6,cr7,cr8
2753  real(dp) :: dm,bb,ci2,dpp,r,r2,r25,r3,r34,r4,r5,r6,r7,r8,rt2i,s,r1,s1,s2,s3,s25,s34,s4,s5,s6,s7,s8
2754  real(dp) :: sin2,ui1,ui2,ui3,ur1,ur2,ur3,sin4,vi1,vi2,vi3,vr1,vr2,vr3
2755 
2756 ! *************************************************************************
2757         atn=after*now
2758         atb=after*before
2759 
2760 !         sqrt(.5d0)
2761         rt2i=half_sqrt2
2762         if (now.eq.2) then
2763         ia=1
2764         nin1=ia-after
2765         nout1=ia-atn
2766         do ib=1,before
2767           nin1=nin1+after
2768           nin2=nin1+atb
2769           nout1=nout1+atn
2770           nout2=nout1+after
2771             do j=1,n1dfft
2772             r1=zin(1,j,nin1)
2773             s1=zin(2,j,nin1)
2774             r2=zin(1,j,nin2)
2775             s2=zin(2,j,nin2)
2776             zout(1,j,nout1)= r2 + r1
2777             zout(2,j,nout1)= s2 + s1
2778             zout(1,j,nout2)= r1 - r2
2779             zout(2,j,nout2)= s1 - s2
2780           enddo
2781         enddo
2782         do 2000,ia=2,after
2783         ias=ia-1
2784         if (2*ias.eq.after) then
2785                 if (isign.eq.1) then
2786                         nin1=ia-after
2787                         nout1=ia-atn
2788                         do ib=1,before
2789                           nin1=nin1+after
2790                           nin2=nin1+atb
2791                           nout1=nout1+atn
2792                           nout2=nout1+after
2793                           do j=1,n1dfft
2794                             r1=zin(1,j,nin1)
2795                             s1=zin(2,j,nin1)
2796                             r2=zin(2,j,nin2)
2797                             s2=zin(1,j,nin2)
2798                             zout(1,j,nout1)= r1 - r2
2799                             zout(2,j,nout1)= s2 + s1
2800                             zout(1,j,nout2)= r2 + r1
2801                             zout(2,j,nout2)= s1 - s2
2802                           enddo
2803                         enddo
2804                 else
2805                         nin1=ia-after
2806                         nout1=ia-atn
2807                         do ib=1,before
2808                           nin1=nin1+after
2809                           nin2=nin1+atb
2810                           nout1=nout1+atn
2811                           nout2=nout1+after
2812                             do j=1,n1dfft
2813                             r1=zin(1,j,nin1)
2814                             s1=zin(2,j,nin1)
2815                             r2=zin(2,j,nin2)
2816                             s2=zin(1,j,nin2)
2817                             zout(1,j,nout1)= r2 + r1
2818                             zout(2,j,nout1)= s1 - s2
2819                             zout(1,j,nout2)= r1 - r2
2820                             zout(2,j,nout2)= s2 + s1
2821                           enddo
2822                         enddo
2823                 end if
2824         else if (4*ias.eq.after) then
2825                 if (isign.eq.1) then
2826                         nin1=ia-after
2827                         nout1=ia-atn
2828                         do ib=1,before
2829                         nin1=nin1+after
2830                         nin2=nin1+atb
2831                         nout1=nout1+atn
2832                         nout2=nout1+after
2833                         do j=1,n1dfft
2834                         r1=zin(1,j,nin1)
2835                         s1=zin(2,j,nin1)
2836                         r=zin(1,j,nin2)
2837                         s=zin(2,j,nin2)
2838                         r2=(r - s)*rt2i
2839                         s2=(r + s)*rt2i
2840                         zout(1,j,nout1)= r2 + r1
2841                         zout(2,j,nout1)= s2 + s1
2842                         zout(1,j,nout2)= r1 - r2
2843                         zout(2,j,nout2)= s1 - s2
2844                         enddo
2845                         enddo
2846                 else
2847                         nin1=ia-after
2848                         nout1=ia-atn
2849                         do ib=1,before
2850                         nin1=nin1+after
2851                         nin2=nin1+atb
2852                         nout1=nout1+atn
2853                         nout2=nout1+after
2854                         do j=1,n1dfft
2855                         r1=zin(1,j,nin1)
2856                         s1=zin(2,j,nin1)
2857                         r=zin(1,j,nin2)
2858                         s=zin(2,j,nin2)
2859                         r2=(r + s)*rt2i
2860                         s2=(s - r)*rt2i
2861                         zout(1,j,nout1)= r2 + r1
2862                         zout(2,j,nout1)= s2 + s1
2863                         zout(1,j,nout2)= r1 - r2
2864                         zout(2,j,nout2)= s1 - s2
2865                         enddo
2866                         enddo
2867                 end if
2868         else if (4*ias.eq.3*after) then
2869                 if (isign.eq.1) then
2870                         nin1=ia-after
2871                         nout1=ia-atn
2872                         do ib=1,before
2873                         nin1=nin1+after
2874                         nin2=nin1+atb
2875                         nout1=nout1+atn
2876                         nout2=nout1+after
2877                         do j=1,n1dfft
2878                         r1=zin(1,j,nin1)
2879                         s1=zin(2,j,nin1)
2880                         r=zin(1,j,nin2)
2881                         s=zin(2,j,nin2)
2882                         r2=(r + s)*rt2i
2883                         s2=(r - s)*rt2i
2884                         zout(1,j,nout1)= r1 - r2
2885                         zout(2,j,nout1)= s2 + s1
2886                         zout(1,j,nout2)= r2 + r1
2887                         zout(2,j,nout2)= s1 - s2
2888                         enddo
2889                         enddo
2890                 else
2891                         nin1=ia-after
2892                         nout1=ia-atn
2893                         do ib=1,before
2894                         nin1=nin1+after
2895                         nin2=nin1+atb
2896                         nout1=nout1+atn
2897                         nout2=nout1+after
2898                         do j=1,n1dfft
2899                         r1=zin(1,j,nin1)
2900                         s1=zin(2,j,nin1)
2901                         r=zin(1,j,nin2)
2902                         s=zin(2,j,nin2)
2903                         r2=(s - r)*rt2i
2904                         s2=(r + s)*rt2i
2905                         zout(1,j,nout1)= r2 + r1
2906                         zout(2,j,nout1)= s1 - s2
2907                         zout(1,j,nout2)= r1 - r2
2908                         zout(2,j,nout2)= s2 + s1
2909                         enddo
2910                         enddo
2911                 end if
2912         else
2913                 itrig=ias*before+1
2914                 cr2=trig(1,itrig)
2915                 ci2=trig(2,itrig)
2916                 nin1=ia-after
2917                 nout1=ia-atn
2918                 do ib=1,before
2919                 nin1=nin1+after
2920                 nin2=nin1+atb
2921                 nout1=nout1+atn
2922                 nout2=nout1+after
2923                 do j=1,n1dfft
2924                 r1=zin(1,j,nin1)
2925                 s1=zin(2,j,nin1)
2926                 r=zin(1,j,nin2)
2927                 s=zin(2,j,nin2)
2928                 r2=r*cr2 - s*ci2
2929                 s2=r*ci2 + s*cr2
2930                 zout(1,j,nout1)= r2 + r1
2931                 zout(2,j,nout1)= s2 + s1
2932                 zout(1,j,nout2)= r1 - r2
2933                 zout(2,j,nout2)= s1 - s2
2934                 enddo
2935                 enddo
2936         end if
2937 2000        continue
2938         else if (now.eq.4) then
2939         if (isign.eq.1) then
2940                 ia=1
2941                 nin1=ia-after
2942                 nout1=ia-atn
2943                 do ib=1,before
2944                 nin1=nin1+after
2945                 nin2=nin1+atb
2946                 nin3=nin2+atb
2947                 nin4=nin3+atb
2948                 nout1=nout1+atn
2949                 nout2=nout1+after
2950                 nout3=nout2+after
2951                 nout4=nout3+after
2952                 do j=1,n1dfft
2953                 r1=zin(1,j,nin1)
2954                 s1=zin(2,j,nin1)
2955                 r2=zin(1,j,nin2)
2956                 s2=zin(2,j,nin2)
2957                 r3=zin(1,j,nin3)
2958                 s3=zin(2,j,nin3)
2959                 r4=zin(1,j,nin4)
2960                 s4=zin(2,j,nin4)
2961                 r=r1 + r3
2962                 s=r2 + r4
2963                 zout(1,j,nout1) = r + s
2964                 zout(1,j,nout3) = r - s
2965                 r=r1 - r3
2966                 s=s2 - s4
2967                 zout(1,j,nout2) = r - s
2968                 zout(1,j,nout4) = r + s
2969                 r=s1 + s3
2970                 s=s2 + s4
2971                 zout(2,j,nout1) = r + s
2972                 zout(2,j,nout3) = r - s
2973                 r=s1 - s3
2974                 s=r2 - r4
2975                 zout(2,j,nout2) = r + s
2976                 zout(2,j,nout4) = r - s
2977                 enddo
2978                 enddo
2979                 do 4000,ia=2,after
2980                 ias=ia-1
2981                 if (2*ias.eq.after) then
2982                         nin1=ia-after
2983                         nout1=ia-atn
2984                         do ib=1,before
2985                         nin1=nin1+after
2986                         nin2=nin1+atb
2987                         nin3=nin2+atb
2988                         nin4=nin3+atb
2989                         nout1=nout1+atn
2990                         nout2=nout1+after
2991                         nout3=nout2+after
2992                         nout4=nout3+after
2993                         do j=1,n1dfft
2994                         r1=zin(1,j,nin1)
2995                         s1=zin(2,j,nin1)
2996                         r=zin(1,j,nin2)
2997                         s=zin(2,j,nin2)
2998                         r2=(r-s)*rt2i
2999                         s2=(r+s)*rt2i
3000                         r3=zin(2,j,nin3)
3001                         s3=zin(1,j,nin3)
3002                         r=zin(1,j,nin4)
3003                         s=zin(2,j,nin4)
3004                         r4=(r + s)*rt2i
3005                         s4=(r - s)*rt2i
3006                         r=r1 - r3
3007                         s=r2 - r4
3008                         zout(1,j,nout1) = r + s
3009                         zout(1,j,nout3) = r - s
3010                         r=r1 + r3
3011                         s=s2 - s4
3012                         zout(1,j,nout2) = r - s
3013                         zout(1,j,nout4) = r + s
3014                         r=s1 + s3
3015                         s=s2 + s4
3016                         zout(2,j,nout1) = r + s
3017                         zout(2,j,nout3) = r - s
3018                         r=s1 - s3
3019                         s=r2 + r4
3020                         zout(2,j,nout2) = r + s
3021                         zout(2,j,nout4) = r - s
3022                         enddo
3023                         enddo
3024                 else
3025                         itt=ias*before
3026                         itrig=itt+1
3027                         cr2=trig(1,itrig)
3028                         ci2=trig(2,itrig)
3029                         itrig=itrig+itt
3030                         cr3=trig(1,itrig)
3031                         ci3=trig(2,itrig)
3032                         itrig=itrig+itt
3033                         cr4=trig(1,itrig)
3034                         ci4=trig(2,itrig)
3035                         nin1=ia-after
3036                         nout1=ia-atn
3037                         do ib=1,before
3038                         nin1=nin1+after
3039                         nin2=nin1+atb
3040                         nin3=nin2+atb
3041                         nin4=nin3+atb
3042                         nout1=nout1+atn
3043                         nout2=nout1+after
3044                         nout3=nout2+after
3045                         nout4=nout3+after
3046                         do j=1,n1dfft
3047                         r1=zin(1,j,nin1)
3048                         s1=zin(2,j,nin1)
3049                         r=zin(1,j,nin2)
3050                         s=zin(2,j,nin2)
3051                         r2=r*cr2 - s*ci2
3052                         s2=r*ci2 + s*cr2
3053                         r=zin(1,j,nin3)
3054                         s=zin(2,j,nin3)
3055                         r3=r*cr3 - s*ci3
3056                         s3=r*ci3 + s*cr3
3057                         r=zin(1,j,nin4)
3058                         s=zin(2,j,nin4)
3059                         r4=r*cr4 - s*ci4
3060                         s4=r*ci4 + s*cr4
3061                         r=r1 + r3
3062                         s=r2 + r4
3063                         zout(1,j,nout1) = r + s
3064                         zout(1,j,nout3) = r - s
3065                         r=r1 - r3
3066                         s=s2 - s4
3067                         zout(1,j,nout2) = r - s
3068                         zout(1,j,nout4) = r + s
3069                         r=s1 + s3
3070                         s=s2 + s4
3071                         zout(2,j,nout1) = r + s
3072                         zout(2,j,nout3) = r - s
3073                         r=s1 - s3
3074                         s=r2 - r4
3075                         zout(2,j,nout2) = r + s
3076                         zout(2,j,nout4) = r - s
3077                         enddo
3078                         enddo
3079                 end if
3080 4000                continue
3081         else
3082                 ia=1
3083                 nin1=ia-after
3084                 nout1=ia-atn
3085                 do ib=1,before
3086                 nin1=nin1+after
3087                 nin2=nin1+atb
3088                 nin3=nin2+atb
3089                 nin4=nin3+atb
3090                 nout1=nout1+atn
3091                 nout2=nout1+after
3092                 nout3=nout2+after
3093                 nout4=nout3+after
3094                 do j=1,n1dfft
3095                 r1=zin(1,j,nin1)
3096                 s1=zin(2,j,nin1)
3097                 r2=zin(1,j,nin2)
3098                 s2=zin(2,j,nin2)
3099                 r3=zin(1,j,nin3)
3100                 s3=zin(2,j,nin3)
3101                 r4=zin(1,j,nin4)
3102                 s4=zin(2,j,nin4)
3103                 r=r1 + r3
3104                 s=r2 + r4
3105                 zout(1,j,nout1) = r + s
3106                 zout(1,j,nout3) = r - s
3107                 r=r1 - r3
3108                 s=s2 - s4
3109                 zout(1,j,nout2) = r + s
3110                 zout(1,j,nout4) = r - s
3111                 r=s1 + s3
3112                 s=s2 + s4
3113                 zout(2,j,nout1) = r + s
3114                 zout(2,j,nout3) = r - s
3115                 r=s1 - s3
3116                 s=r2 - r4
3117                 zout(2,j,nout2) = r - s
3118                 zout(2,j,nout4) = r + s
3119                 enddo
3120                 enddo
3121                 do 4100,ia=2,after
3122                 ias=ia-1
3123                 if (2*ias.eq.after) then
3124                         nin1=ia-after
3125                         nout1=ia-atn
3126                         do ib=1,before
3127                         nin1=nin1+after
3128                         nin2=nin1+atb
3129                         nin3=nin2+atb
3130                         nin4=nin3+atb
3131                         nout1=nout1+atn
3132                         nout2=nout1+after
3133                         nout3=nout2+after
3134                         nout4=nout3+after
3135                         do j=1,n1dfft
3136                         r1=zin(1,j,nin1)
3137                         s1=zin(2,j,nin1)
3138                         r=zin(1,j,nin2)
3139                         s=zin(2,j,nin2)
3140                         r2=(r + s)*rt2i
3141                         s2=(s - r)*rt2i
3142                         r3=zin(2,j,nin3)
3143                         s3=zin(1,j,nin3)
3144                         r=zin(1,j,nin4)
3145                         s=zin(2,j,nin4)
3146                         r4=(s - r)*rt2i
3147                         s4=(r + s)*rt2i
3148                         r=r1 + r3
3149                         s=r2 + r4
3150                         zout(1,j,nout1) = r + s
3151                         zout(1,j,nout3) = r - s
3152                         r=r1 - r3
3153                         s=s2 + s4
3154                         zout(1,j,nout2) = r + s
3155                         zout(1,j,nout4) = r - s
3156                         r=s1 - s3
3157                         s=s2 - s4
3158                         zout(2,j,nout1) = r + s
3159                         zout(2,j,nout3) = r - s
3160                         r=s1 + s3
3161                         s=r2 - r4
3162                         zout(2,j,nout2) = r - s
3163                         zout(2,j,nout4) = r + s
3164                         enddo
3165                         enddo
3166                 else
3167                         itt=ias*before
3168                         itrig=itt+1
3169                         cr2=trig(1,itrig)
3170                         ci2=trig(2,itrig)
3171                         itrig=itrig+itt
3172                         cr3=trig(1,itrig)
3173                         ci3=trig(2,itrig)
3174                         itrig=itrig+itt
3175                         cr4=trig(1,itrig)
3176                         ci4=trig(2,itrig)
3177                         nin1=ia-after
3178                         nout1=ia-atn
3179                         do ib=1,before
3180                         nin1=nin1+after
3181                         nin2=nin1+atb
3182                         nin3=nin2+atb
3183                         nin4=nin3+atb
3184                         nout1=nout1+atn
3185                         nout2=nout1+after
3186                         nout3=nout2+after
3187                         nout4=nout3+after
3188                         do j=1,n1dfft
3189                         r1=zin(1,j,nin1)
3190                         s1=zin(2,j,nin1)
3191                         r=zin(1,j,nin2)
3192                         s=zin(2,j,nin2)
3193                         r2=r*cr2 - s*ci2
3194                         s2=r*ci2 + s*cr2
3195                         r=zin(1,j,nin3)
3196                         s=zin(2,j,nin3)
3197                         r3=r*cr3 - s*ci3
3198                         s3=r*ci3 + s*cr3
3199                         r=zin(1,j,nin4)
3200                         s=zin(2,j,nin4)
3201                         r4=r*cr4 - s*ci4
3202                         s4=r*ci4 + s*cr4
3203                         r=r1 + r3
3204                         s=r2 + r4
3205                         zout(1,j,nout1) = r + s
3206                         zout(1,j,nout3) = r - s
3207                         r=r1 - r3
3208                         s=s2 - s4
3209                         zout(1,j,nout2) = r + s
3210                         zout(1,j,nout4) = r - s
3211                         r=s1 + s3
3212                         s=s2 + s4
3213                         zout(2,j,nout1) = r + s
3214                         zout(2,j,nout3) = r - s
3215                         r=s1 - s3
3216                         s=r2 - r4
3217                         zout(2,j,nout2) = r - s
3218                         zout(2,j,nout4) = r + s
3219                         enddo
3220                         enddo
3221                 end if
3222 4100                continue
3223         end if
3224         else if (now.eq.8) then
3225         if (isign.eq.-1) then
3226                 ia=1
3227                         nin1=ia-after
3228                         nout1=ia-atn
3229                         do ib=1,before
3230                         nin1=nin1+after
3231                         nin2=nin1+atb
3232                         nin3=nin2+atb
3233                         nin4=nin3+atb
3234                         nin5=nin4+atb
3235                         nin6=nin5+atb
3236                         nin7=nin6+atb
3237                         nin8=nin7+atb
3238                         nout1=nout1+atn
3239                         nout2=nout1+after
3240                         nout3=nout2+after
3241                         nout4=nout3+after
3242                         nout5=nout4+after
3243                         nout6=nout5+after
3244                         nout7=nout6+after
3245                         nout8=nout7+after
3246                         do j=1,n1dfft
3247                         r1=zin(1,j,nin1)
3248                         s1=zin(2,j,nin1)
3249                         r2=zin(1,j,nin2)
3250                         s2=zin(2,j,nin2)
3251                         r3=zin(1,j,nin3)
3252                         s3=zin(2,j,nin3)
3253                         r4=zin(1,j,nin4)
3254                         s4=zin(2,j,nin4)
3255                         r5=zin(1,j,nin5)
3256                         s5=zin(2,j,nin5)
3257                         r6=zin(1,j,nin6)
3258                         s6=zin(2,j,nin6)
3259                         r7=zin(1,j,nin7)
3260                         s7=zin(2,j,nin7)
3261                         r8=zin(1,j,nin8)
3262                         s8=zin(2,j,nin8)
3263                         r=r1 + r5
3264                         s=r3 + r7
3265                         ap=r + s
3266                         am=r - s
3267                         r=r2 + r6
3268                         s=r4 + r8
3269                         bp=r + s
3270                         bm=r - s
3271                         r=s1 + s5
3272                         s=s3 + s7
3273                         cp=r + s
3274                         cm=r - s
3275                         r=s2 + s6
3276                         s=s4 + s8
3277                         dpp=r + s
3278                         dm=r - s
3279                         zout(1,j,nout1) = ap + bp
3280                         zout(2,j,nout1) = cp + dpp
3281                         zout(1,j,nout5) = ap - bp
3282                         zout(2,j,nout5) = cp - dpp
3283                         zout(1,j,nout3) = am + dm
3284                         zout(2,j,nout3) = cm - bm
3285                         zout(1,j,nout7) = am - dm
3286                         zout(2,j,nout7) = cm + bm
3287                         r=r1 - r5
3288                         s=s3 - s7
3289                         ap=r + s
3290                         am=r - s
3291                         r=s1 - s5
3292                         s=r3 - r7
3293                         bp=r + s
3294                         bm=r - s
3295                         r=s4 - s8
3296                         s=r2 - r6
3297                         cp=r + s
3298                         cm=r - s
3299                         r=s2 - s6
3300                         s=r4 - r8
3301                         dpp=r + s
3302                         dm=r - s
3303                         r = ( cp + dm)*rt2i
3304                         s = ( dm - cp)*rt2i
3305                         cp= ( cm + dpp)*rt2i
3306                         dpp = ( cm - dpp)*rt2i
3307                         zout(1,j,nout2) = ap + r
3308                         zout(2,j,nout2) = bm + s
3309                         zout(1,j,nout6) = ap - r
3310                         zout(2,j,nout6) = bm - s
3311                         zout(1,j,nout4) = am + cp
3312                         zout(2,j,nout4) = bp + dpp
3313                         zout(1,j,nout8) = am - cp
3314                         zout(2,j,nout8) = bp - dpp
3315                         enddo
3316                         enddo
3317                 do 8000,ia=2,after
3318                 ias=ia-1
3319                         itt=ias*before
3320                         itrig=itt+1
3321                         cr2=trig(1,itrig)
3322                         ci2=trig(2,itrig)
3323                         itrig=itrig+itt
3324                         cr3=trig(1,itrig)
3325                         ci3=trig(2,itrig)
3326                         itrig=itrig+itt
3327                         cr4=trig(1,itrig)
3328                         ci4=trig(2,itrig)
3329                         itrig=itrig+itt
3330                         cr5=trig(1,itrig)
3331                         ci5=trig(2,itrig)
3332                         itrig=itrig+itt
3333                         cr6=trig(1,itrig)
3334                         ci6=trig(2,itrig)
3335                         itrig=itrig+itt
3336                         cr7=trig(1,itrig)
3337                         ci7=trig(2,itrig)
3338                         itrig=itrig+itt
3339                         cr8=trig(1,itrig)
3340                         ci8=trig(2,itrig)
3341                         nin1=ia-after
3342                         nout1=ia-atn
3343                         do ib=1,before
3344                         nin1=nin1+after
3345                         nin2=nin1+atb
3346                         nin3=nin2+atb
3347                         nin4=nin3+atb
3348                         nin5=nin4+atb
3349                         nin6=nin5+atb
3350                         nin7=nin6+atb
3351                         nin8=nin7+atb
3352                         nout1=nout1+atn
3353                         nout2=nout1+after
3354                         nout3=nout2+after
3355                         nout4=nout3+after
3356                         nout5=nout4+after
3357                         nout6=nout5+after
3358                         nout7=nout6+after
3359                         nout8=nout7+after
3360                         do j=1,n1dfft
3361                         r1=zin(1,j,nin1)
3362                         s1=zin(2,j,nin1)
3363                         r=zin(1,j,nin2)
3364                         s=zin(2,j,nin2)
3365                         r2=r*cr2 - s*ci2
3366                         s2=r*ci2 + s*cr2
3367                         r=zin(1,j,nin3)
3368                         s=zin(2,j,nin3)
3369                         r3=r*cr3 - s*ci3
3370                         s3=r*ci3 + s*cr3
3371                         r=zin(1,j,nin4)
3372                         s=zin(2,j,nin4)
3373                         r4=r*cr4 - s*ci4
3374                         s4=r*ci4 + s*cr4
3375                         r=zin(1,j,nin5)
3376                         s=zin(2,j,nin5)
3377                         r5=r*cr5 - s*ci5
3378                         s5=r*ci5 + s*cr5
3379                         r=zin(1,j,nin6)
3380                         s=zin(2,j,nin6)
3381                         r6=r*cr6 - s*ci6
3382                         s6=r*ci6 + s*cr6
3383                         r=zin(1,j,nin7)
3384                         s=zin(2,j,nin7)
3385                         r7=r*cr7 - s*ci7
3386                         s7=r*ci7 + s*cr7
3387                         r=zin(1,j,nin8)
3388                         s=zin(2,j,nin8)
3389                         r8=r*cr8 - s*ci8
3390                         s8=r*ci8 + s*cr8
3391                         r=r1 + r5
3392                         s=r3 + r7
3393                         ap=r + s
3394                         am=r - s
3395                         r=r2 + r6
3396                         s=r4 + r8
3397                         bp=r + s
3398                         bm=r - s
3399                         r=s1 + s5
3400                         s=s3 + s7
3401                         cp=r + s
3402                         cm=r - s
3403                         r=s2 + s6
3404                         s=s4 + s8
3405                         dpp=r + s
3406                         dm=r - s
3407                         zout(1,j,nout1) = ap + bp
3408                         zout(2,j,nout1) = cp + dpp
3409                         zout(1,j,nout5) = ap - bp
3410                         zout(2,j,nout5) = cp - dpp
3411                         zout(1,j,nout3) = am + dm
3412                         zout(2,j,nout3) = cm - bm
3413                         zout(1,j,nout7) = am - dm
3414                         zout(2,j,nout7) = cm + bm
3415                         r=r1 - r5
3416                         s=s3 - s7
3417                         ap=r + s
3418                         am=r - s
3419                         r=s1 - s5
3420                         s=r3 - r7
3421                         bp=r + s
3422                         bm=r - s
3423                         r=s4 - s8
3424                         s=r2 - r6
3425                         cp=r + s
3426                         cm=r - s
3427                         r=s2 - s6
3428                         s=r4 - r8
3429                         dpp=r + s
3430                         dm=r - s
3431                         r = ( cp + dm)*rt2i
3432                         s = ( dm - cp)*rt2i
3433                         cp= ( cm + dpp)*rt2i
3434                         dpp = ( cm - dpp)*rt2i
3435                         zout(1,j,nout2) = ap + r
3436                         zout(2,j,nout2) = bm + s
3437                         zout(1,j,nout6) = ap - r
3438                         zout(2,j,nout6) = bm - s
3439                         zout(1,j,nout4) = am + cp
3440                         zout(2,j,nout4) = bp + dpp
3441                         zout(1,j,nout8) = am - cp
3442                         zout(2,j,nout8) = bp - dpp
3443                         enddo
3444                         enddo
3445 8000                continue
3446 
3447         else
3448                 ia=1
3449                         nin1=ia-after
3450                         nout1=ia-atn
3451                         do ib=1,before
3452                         nin1=nin1+after
3453                         nin2=nin1+atb
3454                         nin3=nin2+atb
3455                         nin4=nin3+atb
3456                         nin5=nin4+atb
3457                         nin6=nin5+atb
3458                         nin7=nin6+atb
3459                         nin8=nin7+atb
3460                         nout1=nout1+atn
3461                         nout2=nout1+after
3462                         nout3=nout2+after
3463                         nout4=nout3+after
3464                         nout5=nout4+after
3465                         nout6=nout5+after
3466                         nout7=nout6+after
3467                         nout8=nout7+after
3468                         do j=1,n1dfft
3469                         r1=zin(1,j,nin1)
3470                         s1=zin(2,j,nin1)
3471                         r2=zin(1,j,nin2)
3472                         s2=zin(2,j,nin2)
3473                         r3=zin(1,j,nin3)
3474                         s3=zin(2,j,nin3)
3475                         r4=zin(1,j,nin4)
3476                         s4=zin(2,j,nin4)
3477                         r5=zin(1,j,nin5)
3478                         s5=zin(2,j,nin5)
3479                         r6=zin(1,j,nin6)
3480                         s6=zin(2,j,nin6)
3481                         r7=zin(1,j,nin7)
3482                         s7=zin(2,j,nin7)
3483                         r8=zin(1,j,nin8)
3484                         s8=zin(2,j,nin8)
3485                         r=r1 + r5
3486                         s=r3 + r7
3487                         ap=r + s
3488                         am=r - s
3489                         r=r2 + r6
3490                         s=r4 + r8
3491                         bp=r + s
3492                         bm=r - s
3493                         r=s1 + s5
3494                         s=s3 + s7
3495                         cp=r + s
3496                         cm=r - s
3497                         r=s2 + s6
3498                         s=s4 + s8
3499                         dpp=r + s
3500                         dm=r - s
3501                         zout(1,j,nout1) = ap + bp
3502                         zout(2,j,nout1) = cp + dpp
3503                         zout(1,j,nout5) = ap - bp
3504                         zout(2,j,nout5) = cp - dpp
3505                         zout(1,j,nout3) = am - dm
3506                         zout(2,j,nout3) = cm + bm
3507                         zout(1,j,nout7) = am + dm
3508                         zout(2,j,nout7) = cm - bm
3509                         r= r1 - r5
3510                         s=-s3 + s7
3511                         ap=r + s
3512                         am=r - s
3513                         r=s1 - s5
3514                         s=r7 - r3
3515                         bp=r + s
3516                         bm=r - s
3517                         r=-s4 + s8
3518                         s= r2 - r6
3519                         cp=r + s
3520                         cm=r - s
3521                         r=-s2 + s6
3522                         s= r4 - r8
3523                         dpp=r + s
3524                         dm=r - s
3525                         r = ( cp + dm)*rt2i
3526                         s = ( cp - dm)*rt2i
3527                         cp= ( cm + dpp)*rt2i
3528                         dpp= ( dpp - cm)*rt2i
3529                         zout(1,j,nout2) = ap + r
3530                         zout(2,j,nout2) = bm + s
3531                         zout(1,j,nout6) = ap - r
3532                         zout(2,j,nout6) = bm - s
3533                         zout(1,j,nout4) = am + cp
3534                         zout(2,j,nout4) = bp + dpp
3535                         zout(1,j,nout8) = am - cp
3536                         zout(2,j,nout8) = bp - dpp
3537                         enddo
3538                         enddo
3539 
3540                 do 8001,ia=2,after
3541                 ias=ia-1
3542                         itt=ias*before
3543                         itrig=itt+1
3544                         cr2=trig(1,itrig)
3545                         ci2=trig(2,itrig)
3546                         itrig=itrig+itt
3547                         cr3=trig(1,itrig)
3548                         ci3=trig(2,itrig)
3549                         itrig=itrig+itt
3550                         cr4=trig(1,itrig)
3551                         ci4=trig(2,itrig)
3552                         itrig=itrig+itt
3553                         cr5=trig(1,itrig)
3554                         ci5=trig(2,itrig)
3555                         itrig=itrig+itt
3556                         cr6=trig(1,itrig)
3557                         ci6=trig(2,itrig)
3558                         itrig=itrig+itt
3559                         cr7=trig(1,itrig)
3560                         ci7=trig(2,itrig)
3561                         itrig=itrig+itt
3562                         cr8=trig(1,itrig)
3563                         ci8=trig(2,itrig)
3564                         nin1=ia-after
3565                         nout1=ia-atn
3566                         do ib=1,before
3567                         nin1=nin1+after
3568                         nin2=nin1+atb
3569                         nin3=nin2+atb
3570                         nin4=nin3+atb
3571                         nin5=nin4+atb
3572                         nin6=nin5+atb
3573                         nin7=nin6+atb
3574                         nin8=nin7+atb
3575                         nout1=nout1+atn
3576                         nout2=nout1+after
3577                         nout3=nout2+after
3578                         nout4=nout3+after
3579                         nout5=nout4+after
3580                         nout6=nout5+after
3581                         nout7=nout6+after
3582                         nout8=nout7+after
3583                         do j=1,n1dfft
3584                         r1=zin(1,j,nin1)
3585                         s1=zin(2,j,nin1)
3586                         r=zin(1,j,nin2)
3587                         s=zin(2,j,nin2)
3588                         r2=r*cr2 - s*ci2
3589                         s2=r*ci2 + s*cr2
3590                         r=zin(1,j,nin3)
3591                         s=zin(2,j,nin3)
3592                         r3=r*cr3 - s*ci3
3593                         s3=r*ci3 + s*cr3
3594                         r=zin(1,j,nin4)
3595                         s=zin(2,j,nin4)
3596                         r4=r*cr4 - s*ci4
3597                         s4=r*ci4 + s*cr4
3598                         r=zin(1,j,nin5)
3599                         s=zin(2,j,nin5)
3600                         r5=r*cr5 - s*ci5
3601                         s5=r*ci5 + s*cr5
3602                         r=zin(1,j,nin6)
3603                         s=zin(2,j,nin6)
3604                         r6=r*cr6 - s*ci6
3605                         s6=r*ci6 + s*cr6
3606                         r=zin(1,j,nin7)
3607                         s=zin(2,j,nin7)
3608                         r7=r*cr7 - s*ci7
3609                         s7=r*ci7 + s*cr7
3610                         r=zin(1,j,nin8)
3611                         s=zin(2,j,nin8)
3612                         r8=r*cr8 - s*ci8
3613                         s8=r*ci8 + s*cr8
3614                         r=r1 + r5
3615                         s=r3 + r7
3616                         ap=r + s
3617                         am=r - s
3618                         r=r2 + r6
3619                         s=r4 + r8
3620                         bp=r + s
3621                         bm=r - s
3622                         r=s1 + s5
3623                         s=s3 + s7
3624                         cp=r + s
3625                         cm=r - s
3626                         r=s2 + s6
3627                         s=s4 + s8
3628                         dpp=r + s
3629                         dm=r - s
3630                         zout(1,j,nout1) = ap + bp
3631                         zout(2,j,nout1) = cp + dpp
3632                         zout(1,j,nout5) = ap - bp
3633                         zout(2,j,nout5) = cp - dpp
3634                         zout(1,j,nout3) = am - dm
3635                         zout(2,j,nout3) = cm + bm
3636                         zout(1,j,nout7) = am + dm
3637                         zout(2,j,nout7) = cm - bm
3638                         r= r1 - r5
3639                         s=-s3 + s7
3640                         ap=r + s
3641                         am=r - s
3642                         r=s1 - s5
3643                         s=r7 - r3
3644                         bp=r + s
3645                         bm=r - s
3646                         r=-s4 + s8
3647                         s= r2 - r6
3648                         cp=r + s
3649                         cm=r - s
3650                         r=-s2 + s6
3651                         s= r4 - r8
3652                         dpp=r + s
3653                         dm=r - s
3654                         r = ( cp + dm)*rt2i
3655                         s = ( cp - dm)*rt2i
3656                         cp= ( cm + dpp)*rt2i
3657                         dpp= ( dpp - cm)*rt2i
3658                         zout(1,j,nout2) = ap + r
3659                         zout(2,j,nout2) = bm + s
3660                         zout(1,j,nout6) = ap - r
3661                         zout(2,j,nout6) = bm - s
3662                         zout(1,j,nout4) = am + cp
3663                         zout(2,j,nout4) = bp + dpp
3664                         zout(1,j,nout8) = am - cp
3665                         zout(2,j,nout8) = bp - dpp
3666                         enddo
3667                         enddo
3668 8001                continue
3669 
3670         end if
3671         else if (now.eq.3) then
3672 !         .5d0*sqrt(3.d0)
3673         bb=isign*0.8660254037844387d0
3674         ia=1
3675         nin1=ia-after
3676         nout1=ia-atn
3677         do ib=1,before
3678         nin1=nin1+after
3679         nin2=nin1+atb
3680         nin3=nin2+atb
3681         nout1=nout1+atn
3682         nout2=nout1+after
3683         nout3=nout2+after
3684         do j=1,n1dfft
3685         r1=zin(1,j,nin1)
3686         s1=zin(2,j,nin1)
3687         r2=zin(1,j,nin2)
3688         s2=zin(2,j,nin2)
3689         r3=zin(1,j,nin3)
3690         s3=zin(2,j,nin3)
3691         r=r2 + r3
3692         s=s2 + s3
3693         zout(1,j,nout1) = r + r1
3694         zout(2,j,nout1) = s + s1
3695         r1=r1 - .5d0*r
3696         s1=s1 - .5d0*s
3697         r2=bb*(r2-r3)
3698         s2=bb*(s2-s3)
3699         zout(1,j,nout2) = r1 - s2
3700         zout(2,j,nout2) = s1 + r2
3701         zout(1,j,nout3) = r1 + s2
3702         zout(2,j,nout3) = s1 - r2
3703         enddo
3704         enddo
3705         do 3000,ia=2,after
3706         ias=ia-1
3707         if (4*ias.eq.3*after) then
3708         if (isign.eq.1) then
3709                 nin1=ia-after
3710                 nout1=ia-atn
3711                 do ib=1,before
3712                 nin1=nin1+after
3713                 nin2=nin1+atb
3714                 nin3=nin2+atb
3715                 nout1=nout1+atn
3716                 nout2=nout1+after
3717                 nout3=nout2+after
3718                 do j=1,n1dfft
3719                 r1=zin(1,j,nin1)
3720                 s1=zin(2,j,nin1)
3721                 r2=zin(2,j,nin2)
3722                 s2=zin(1,j,nin2)
3723                 r3=zin(1,j,nin3)
3724                 s3=zin(2,j,nin3)
3725                 r=r3 + r2
3726                 s=s2 - s3
3727                 zout(1,j,nout1) = r1 - r
3728                 zout(2,j,nout1) = s + s1
3729                 r1=r1 + .5d0*r
3730                 s1=s1 - .5d0*s
3731                 r2=bb*(r2-r3)
3732                 s2=bb*(s2+s3)
3733                 zout(1,j,nout2) = r1 - s2
3734                 zout(2,j,nout2) = s1 - r2
3735                 zout(1,j,nout3) = r1 + s2
3736                 zout(2,j,nout3) = s1 + r2
3737                 enddo
3738                 enddo
3739         else
3740                 nin1=ia-after
3741                 nout1=ia-atn
3742                 do ib=1,before
3743                 nin1=nin1+after
3744                 nin2=nin1+atb
3745                 nin3=nin2+atb
3746                 nout1=nout1+atn
3747                 nout2=nout1+after
3748                 nout3=nout2+after
3749                 do j=1,n1dfft
3750                 r1=zin(1,j,nin1)
3751                 s1=zin(2,j,nin1)
3752                 r2=zin(2,j,nin2)
3753                 s2=zin(1,j,nin2)
3754                 r3=zin(1,j,nin3)
3755                 s3=zin(2,j,nin3)
3756                 r=r2 - r3
3757                 s=s2 + s3
3758                 zout(1,j,nout1) = r + r1
3759                 zout(2,j,nout1) = s1 - s
3760                 r1=r1 - .5d0*r
3761                 s1=s1 + .5d0*s
3762                 r2=bb*(r2+r3)
3763                 s2=bb*(s2-s3)
3764                 zout(1,j,nout2) = r1 + s2
3765                 zout(2,j,nout2) = s1 + r2
3766                 zout(1,j,nout3) = r1 - s2
3767                 zout(2,j,nout3) = s1 - r2
3768                 enddo
3769                 enddo
3770         end if
3771         else if (8*ias.eq.3*after) then
3772         if (isign.eq.1) then
3773                 nin1=ia-after
3774                 nout1=ia-atn
3775                 do ib=1,before
3776                 nin1=nin1+after
3777                 nin2=nin1+atb
3778                 nin3=nin2+atb
3779                 nout1=nout1+atn
3780                 nout2=nout1+after
3781                 nout3=nout2+after
3782                 do j=1,n1dfft
3783                 r1=zin(1,j,nin1)
3784                 s1=zin(2,j,nin1)
3785                 r=zin(1,j,nin2)
3786                 s=zin(2,j,nin2)
3787                 r2=(r - s)*rt2i
3788                 s2=(r + s)*rt2i
3789                 r3=zin(2,j,nin3)
3790                 s3=zin(1,j,nin3)
3791                 r=r2 - r3
3792                 s=s2 + s3
3793                 zout(1,j,nout1) = r + r1
3794                 zout(2,j,nout1) = s + s1
3795                 r1=r1 - .5d0*r
3796                 s1=s1 - .5d0*s
3797                 r2=bb*(r2+r3)
3798                 s2=bb*(s2-s3)
3799                 zout(1,j,nout2) = r1 - s2
3800                 zout(2,j,nout2) = s1 + r2
3801                 zout(1,j,nout3) = r1 + s2
3802                 zout(2,j,nout3) = s1 - r2
3803                 enddo
3804                 enddo
3805         else
3806                 nin1=ia-after
3807                 nout1=ia-atn
3808                 do ib=1,before
3809                 nin1=nin1+after
3810                 nin2=nin1+atb
3811                 nin3=nin2+atb
3812                 nout1=nout1+atn
3813                 nout2=nout1+after
3814                 nout3=nout2+after
3815                 do j=1,n1dfft
3816                 r1=zin(1,j,nin1)
3817                 s1=zin(2,j,nin1)
3818                 r=zin(1,j,nin2)
3819                 s=zin(2,j,nin2)
3820                 r2=(r + s)*rt2i
3821                 s2=(s - r)*rt2i
3822                 r3=zin(2,j,nin3)
3823                 s3=zin(1,j,nin3)
3824                 r=r2 + r3
3825                 s=s2 - s3
3826                 zout(1,j,nout1) = r + r1
3827                 zout(2,j,nout1) = s + s1
3828                 r1=r1 - .5d0*r
3829                 s1=s1 - .5d0*s
3830                 r2=bb*(r2-r3)
3831                 s2=bb*(s2+s3)
3832                 zout(1,j,nout2) = r1 - s2
3833                 zout(2,j,nout2) = s1 + r2
3834                 zout(1,j,nout3) = r1 + s2
3835                 zout(2,j,nout3) = s1 - r2
3836                 enddo
3837                 enddo
3838         end if
3839         else
3840         itt=ias*before
3841         itrig=itt+1
3842         cr2=trig(1,itrig)
3843         ci2=trig(2,itrig)
3844         itrig=itrig+itt
3845         cr3=trig(1,itrig)
3846         ci3=trig(2,itrig)
3847         nin1=ia-after
3848         nout1=ia-atn
3849         do ib=1,before
3850         nin1=nin1+after
3851         nin2=nin1+atb
3852         nin3=nin2+atb
3853         nout1=nout1+atn
3854         nout2=nout1+after
3855         nout3=nout2+after
3856         do j=1,n1dfft
3857         r1=zin(1,j,nin1)
3858         s1=zin(2,j,nin1)
3859         r=zin(1,j,nin2)
3860         s=zin(2,j,nin2)
3861         r2=r*cr2 - s*ci2
3862         s2=r*ci2 + s*cr2
3863         r=zin(1,j,nin3)
3864         s=zin(2,j,nin3)
3865         r3=r*cr3 - s*ci3
3866         s3=r*ci3 + s*cr3
3867         r=r2 + r3
3868         s=s2 + s3
3869         zout(1,j,nout1) = r + r1
3870         zout(2,j,nout1) = s + s1
3871         r1=r1 - .5d0*r
3872         s1=s1 - .5d0*s
3873         r2=bb*(r2-r3)
3874         s2=bb*(s2-s3)
3875         zout(1,j,nout2) = r1 - s2
3876         zout(2,j,nout2) = s1 + r2
3877         zout(1,j,nout3) = r1 + s2
3878         zout(2,j,nout3) = s1 - r2
3879         enddo
3880         enddo
3881         end if
3882 3000        continue
3883         else if (now==5) then
3884 !         cos(2.d0*pi/5.d0)
3885         cos2=0.3090169943749474d0
3886 !         cos(4.d0*pi/5.d0)
3887         cos4=-0.8090169943749474d0
3888 !        sin(2.d0*pi/5.d0)
3889         sin2=isign*0.9510565162951536d0
3890 !         sin(4.d0*pi/5.d0)
3891         sin4=isign*0.5877852522924731d0
3892         ia=1
3893         nin1=ia-after
3894         nout1=ia-atn
3895         do ib=1,before
3896         nin1=nin1+after
3897         nin2=nin1+atb
3898         nin3=nin2+atb
3899         nin4=nin3+atb
3900         nin5=nin4+atb
3901         nout1=nout1+atn
3902         nout2=nout1+after
3903         nout3=nout2+after
3904         nout4=nout3+after
3905         nout5=nout4+after
3906         do j=1,n1dfft
3907         r1=zin(1,j,nin1)
3908         s1=zin(2,j,nin1)
3909         r2=zin(1,j,nin2)
3910         s2=zin(2,j,nin2)
3911         r3=zin(1,j,nin3)
3912         s3=zin(2,j,nin3)
3913         r4=zin(1,j,nin4)
3914         s4=zin(2,j,nin4)
3915         r5=zin(1,j,nin5)
3916         s5=zin(2,j,nin5)
3917         r25 = r2 + r5
3918         r34 = r3 + r4
3919         s25 = s2 - s5
3920         s34 = s3 - s4
3921         zout(1,j,nout1) = r1 + r25 + r34
3922         r = r1 + cos2*r25 + cos4*r34
3923         s = sin2*s25 + sin4*s34
3924         zout(1,j,nout2) = r - s
3925         zout(1,j,nout5) = r + s
3926         r = r1 + cos4*r25 + cos2*r34
3927         s = sin4*s25 - sin2*s34
3928         zout(1,j,nout3) = r - s
3929         zout(1,j,nout4) = r + s
3930         r25 = r2 - r5
3931         r34 = r3 - r4
3932         s25 = s2 + s5
3933         s34 = s3 + s4
3934         zout(2,j,nout1) = s1 + s25 + s34
3935         r = s1 + cos2*s25 + cos4*s34
3936         s = sin2*r25 + sin4*r34
3937         zout(2,j,nout2) = r + s
3938         zout(2,j,nout5) = r - s
3939         r = s1 + cos4*s25 + cos2*s34
3940         s = sin4*r25 - sin2*r34
3941         zout(2,j,nout3) = r + s
3942         zout(2,j,nout4) = r - s
3943         enddo
3944         enddo
3945         do 5000,ia=2,after
3946         ias=ia-1
3947         if (8*ias.eq.5*after) then
3948                 if (isign.eq.1) then
3949                         nin1=ia-after
3950                         nout1=ia-atn
3951                         do ib=1,before
3952                         nin1=nin1+after
3953                         nin2=nin1+atb
3954                         nin3=nin2+atb
3955                         nin4=nin3+atb
3956                         nin5=nin4+atb
3957                         nout1=nout1+atn
3958                         nout2=nout1+after
3959                         nout3=nout2+after
3960                         nout4=nout3+after
3961                         nout5=nout4+after
3962                         do j=1,n1dfft
3963                         r1=zin(1,j,nin1)
3964                         s1=zin(2,j,nin1)
3965                         r=zin(1,j,nin2)
3966                         s=zin(2,j,nin2)
3967                         r2=(r - s)*rt2i
3968                         s2=(r + s)*rt2i
3969                         r3=zin(2,j,nin3)
3970                         s3=zin(1,j,nin3)
3971                         r=zin(1,j,nin4)
3972                         s=zin(2,j,nin4)
3973                         r4=(r + s)*rt2i
3974                         s4=(r - s)*rt2i
3975                         r5=zin(1,j,nin5)
3976                         s5=zin(2,j,nin5)
3977                         r25 = r2 - r5
3978                         r34 = r3 + r4
3979                         s25 = s2 + s5
3980                         s34 = s3 - s4
3981                         zout(1,j,nout1) = r1 + r25 - r34
3982                         r = r1 + cos2*r25 - cos4*r34
3983                         s = sin2*s25 + sin4*s34
3984                         zout(1,j,nout2) = r - s
3985                         zout(1,j,nout5) = r + s
3986                         r = r1 + cos4*r25 - cos2*r34
3987                         s = sin4*s25 - sin2*s34
3988                         zout(1,j,nout3) = r - s
3989                         zout(1,j,nout4) = r + s
3990                         r25 = r2 + r5
3991                         r34 = r4 - r3
3992                         s25 = s2 - s5
3993                         s34 = s3 + s4
3994                         zout(2,j,nout1) = s1 + s25 + s34
3995                         r = s1 + cos2*s25 + cos4*s34
3996                         s = sin2*r25 + sin4*r34
3997                         zout(2,j,nout2) = r + s
3998                         zout(2,j,nout5) = r - s
3999                         r = s1 + cos4*s25 + cos2*s34
4000                         s = sin4*r25 - sin2*r34
4001                         zout(2,j,nout3) = r + s
4002                         zout(2,j,nout4) = r - s
4003                         enddo
4004                         enddo
4005                 else
4006                         nin1=ia-after
4007                         nout1=ia-atn
4008                         do ib=1,before
4009                         nin1=nin1+after
4010                         nin2=nin1+atb
4011                         nin3=nin2+atb
4012                         nin4=nin3+atb
4013                         nin5=nin4+atb
4014                         nout1=nout1+atn
4015                         nout2=nout1+after
4016                         nout3=nout2+after
4017                         nout4=nout3+after
4018                         nout5=nout4+after
4019                         do j=1,n1dfft
4020                         r1=zin(1,j,nin1)
4021                         s1=zin(2,j,nin1)
4022                         r=zin(1,j,nin2)
4023                         s=zin(2,j,nin2)
4024                         r2=(r + s)*rt2i
4025                         s2=(s - r)*rt2i
4026                         r3=zin(2,j,nin3)
4027                         s3=zin(1,j,nin3)
4028                         r=zin(1,j,nin4)
4029                         s=zin(2,j,nin4)
4030                         r4=(s - r)*rt2i
4031                         s4=(r + s)*rt2i
4032                         r5=zin(1,j,nin5)
4033                         s5=zin(2,j,nin5)
4034                         r25 = r2 - r5
4035                         r34 = r3 + r4
4036                         s25 = s2 + s5
4037                         s34 = s4 - s3
4038                         zout(1,j,nout1) = r1 + r25 + r34
4039                         r = r1 + cos2*r25 + cos4*r34
4040                         s = sin2*s25 + sin4*s34
4041                         zout(1,j,nout2) = r - s
4042                         zout(1,j,nout5) = r + s
4043                         r = r1 + cos4*r25 + cos2*r34
4044                         s = sin4*s25 - sin2*s34
4045                         zout(1,j,nout3) = r - s
4046                         zout(1,j,nout4) = r + s
4047                         r25 = r2 + r5
4048                         r34 = r3 - r4
4049                         s25 = s2 - s5
4050                         s34 = s3 + s4
4051                         zout(2,j,nout1) = s1 + s25 - s34
4052                         r = s1 + cos2*s25 - cos4*s34
4053                         s = sin2*r25 + sin4*r34
4054                         zout(2,j,nout2) = r + s
4055                         zout(2,j,nout5) = r - s
4056                         r = s1 + cos4*s25 - cos2*s34
4057                         s = sin4*r25 - sin2*r34
4058                         zout(2,j,nout3) = r + s
4059                         zout(2,j,nout4) = r - s
4060                         enddo
4061                         enddo
4062                 end if
4063         else
4064                 ias=ia-1
4065                 itt=ias*before
4066                 itrig=itt+1
4067                 cr2=trig(1,itrig)
4068                 ci2=trig(2,itrig)
4069                 itrig=itrig+itt
4070                 cr3=trig(1,itrig)
4071                 ci3=trig(2,itrig)
4072                 itrig=itrig+itt
4073                 cr4=trig(1,itrig)
4074                 ci4=trig(2,itrig)
4075                 itrig=itrig+itt
4076                 cr5=trig(1,itrig)
4077                 ci5=trig(2,itrig)
4078                 nin1=ia-after
4079                 nout1=ia-atn
4080                 do ib=1,before
4081                 nin1=nin1+after
4082                 nin2=nin1+atb
4083                 nin3=nin2+atb
4084                 nin4=nin3+atb
4085                 nin5=nin4+atb
4086                 nout1=nout1+atn
4087                 nout2=nout1+after
4088                 nout3=nout2+after
4089                 nout4=nout3+after
4090                 nout5=nout4+after
4091                 do j=1,n1dfft
4092                 r1=zin(1,j,nin1)
4093                 s1=zin(2,j,nin1)
4094                 r=zin(1,j,nin2)
4095                 s=zin(2,j,nin2)
4096                 r2=r*cr2 - s*ci2
4097                 s2=r*ci2 + s*cr2
4098                 r=zin(1,j,nin3)
4099                 s=zin(2,j,nin3)
4100                 r3=r*cr3 - s*ci3
4101                 s3=r*ci3 + s*cr3
4102                 r=zin(1,j,nin4)
4103                 s=zin(2,j,nin4)
4104                 r4=r*cr4 - s*ci4
4105                 s4=r*ci4 + s*cr4
4106                 r=zin(1,j,nin5)
4107                 s=zin(2,j,nin5)
4108                 r5=r*cr5 - s*ci5
4109                 s5=r*ci5 + s*cr5
4110                 r25 = r2 + r5
4111                 r34 = r3 + r4
4112                 s25 = s2 - s5
4113                 s34 = s3 - s4
4114                 zout(1,j,nout1) = r1 + r25 + r34
4115                 r = r1 + cos2*r25 + cos4*r34
4116                 s = sin2*s25 + sin4*s34
4117                 zout(1,j,nout2) = r - s
4118                 zout(1,j,nout5) = r + s
4119                 r = r1 + cos4*r25 + cos2*r34
4120                 s = sin4*s25 - sin2*s34
4121                 zout(1,j,nout3) = r - s
4122                 zout(1,j,nout4) = r + s
4123                 r25 = r2 - r5
4124                 r34 = r3 - r4
4125                 s25 = s2 + s5
4126                 s34 = s3 + s4
4127                 zout(2,j,nout1) = s1 + s25 + s34
4128                 r = s1 + cos2*s25 + cos4*s34
4129                 s = sin2*r25 + sin4*r34
4130                 zout(2,j,nout2) = r + s
4131                 zout(2,j,nout5) = r - s
4132                 r = s1 + cos4*s25 + cos2*s34
4133                 s = sin4*r25 - sin2*r34
4134                 zout(2,j,nout3) = r + s
4135                 zout(2,j,nout4) = r - s
4136                 enddo
4137                 enddo
4138         end if
4139 5000        continue
4140        else if (now.eq.6) then
4141 !         .5d0*sqrt(3.d0)
4142         bb=isign*0.8660254037844387d0
4143 
4144         ia=1
4145         nin1=ia-after
4146         nout1=ia-atn
4147         do ib=1,before
4148         nin1=nin1+after
4149         nin2=nin1+atb
4150         nin3=nin2+atb
4151         nin4=nin3+atb
4152         nin5=nin4+atb
4153         nin6=nin5+atb
4154         nout1=nout1+atn
4155         nout2=nout1+after
4156         nout3=nout2+after
4157         nout4=nout3+after
4158         nout5=nout4+after
4159         nout6=nout5+after
4160         do j=1,n1dfft
4161         r2=zin(1,j,nin3)
4162         s2=zin(2,j,nin3)
4163         r3=zin(1,j,nin5)
4164         s3=zin(2,j,nin5)
4165         r=r2 + r3
4166         s=s2 + s3
4167         r1=zin(1,j,nin1)
4168         s1=zin(2,j,nin1)
4169         ur1 = r + r1
4170         ui1 = s + s1
4171         r1=r1 - .5d0*r
4172         s1=s1 - .5d0*s
4173         r=r2-r3
4174         s=s2-s3
4175         ur2 = r1 - s*bb
4176         ui2 = s1 + r*bb
4177         ur3 = r1 + s*bb
4178         ui3 = s1 - r*bb
4179 
4180         r2=zin(1,j,nin6)
4181         s2=zin(2,j,nin6)
4182         r3=zin(1,j,nin2)
4183         s3=zin(2,j,nin2)
4184         r=r2 + r3
4185         s=s2 + s3
4186         r1=zin(1,j,nin4)
4187         s1=zin(2,j,nin4)
4188         vr1 = r + r1
4189         vi1 = s + s1
4190         r1=r1 - .5d0*r
4191         s1=s1 - .5d0*s
4192         r=r2-r3
4193         s=s2-s3
4194         vr2 = r1 - s*bb
4195         vi2 = s1 + r*bb
4196         vr3 = r1 + s*bb
4197         vi3 = s1 - r*bb
4198 
4199         zout(1,j,nout1)=ur1+vr1
4200         zout(2,j,nout1)=ui1+vi1
4201         zout(1,j,nout5)=ur2+vr2
4202         zout(2,j,nout5)=ui2+vi2
4203         zout(1,j,nout3)=ur3+vr3
4204         zout(2,j,nout3)=ui3+vi3
4205         zout(1,j,nout4)=ur1-vr1
4206         zout(2,j,nout4)=ui1-vi1
4207         zout(1,j,nout2)=ur2-vr2
4208         zout(2,j,nout2)=ui2-vi2
4209         zout(1,j,nout6)=ur3-vr3
4210         zout(2,j,nout6)=ui3-vi3
4211         enddo
4212         enddo
4213 
4214         else
4215           MSG_ERROR('error fftstp')
4216         end if
4217 
4218 end subroutine fftstp

m_sg2002/sg2002_accrho [ Functions ]

[ Top ] [ m_sg2002 ] [ Functions ]

NAME

 sg2002_accrho

FUNCTION

 Accumulates the real space density rho from the ndat wavefunctions zf
 by transforming zf into real space and adding all the amplitudes squared

 INPUTS:
   ZF: input array (note the switch of i2 and i3)
         real(F(i1,i3,i2,idat))=ZF(1,i1,i3,i2,idat)
         imag(F(i1,i3,i2,idat))=ZF(2,i1,i3,i2,idat)
   max1 is positive or zero ; m1 >=max1+1
   i1= 1... max1+1 corresponds to positive and zero wavevectors 0 ... max1
   then, if m1 > max1+1, one has min1=max1-m1+1 and
   i1= max1+2 ... m1 corresponds to negative wavevectors min1 ... -1
   i2 and i3 have a similar definition of range
   idat=1,ndat
   md1,md2,md3: Dimension of ZF
   md2proc=((md2-1)/nproc_fft)+1 ! maximal number of small box 2nd dim slices for one proc
   weight(ndat)= weight for the density accumulation

 OUTPUTS:
    RHOoutput(i1,i2,i3) = RHOinput(i1,i2,i3) + sum on idat of (Re(FFT(ZF))**2 *weight_r + weight_i*Im(FFT(ZF))**2
        i1=1,n1 , i2=1,n2 , i3=1,n3
   comm_fft: MPI communicator
   nproc_fft: number of processors used as returned by MPI_COMM_SIZE
   me_fft: [0:nproc_fft-1] number of processor as returned by MPI_COMM_RANK
    n1,n2,n3: logical dimension of the transform. As transform lengths
              most products of the prime factors 2,3,5 are allowed.
             The detailed table with allowed transform lengths can
             be found in subroutine CTRIG
    nd1,nd2,nd3: Dimension of RHO
   nd3proc=((nd3-1)/nproc_fft)+1 ! maximal number of big box 3rd dim slices for one proc

 NOTES:
   PERFORMANCE CONSIDERATIONS:
   The maximum number of processors that can reasonably be used is max(n2/2,n3/2)

   It is very important to find the optimal
   value of NCACHE. NCACHE determines the size of the work array ZW, that
   has to fit into cache. It has therefore to be chosen to equal roughly
    half the size of the physical cache in units of real*8 numbers.
   The optimal value of ncache can easily be determined by numerical
   experimentation. A too large value of ncache leads to a dramatic
   and sudden decrease of performance, a too small value to a to a
   slow and less dramatic decrease of performance. If NCACHE is set
   to a value so small, that not even a single one dimensional transform
   can be done in the workarray zw, the program stops with an error message.

PARENTS

      m_fft

CHILDREN

SOURCE

2358 subroutine sg2002_accrho(icplexwf,ndat,n1,n2,n3,nd1,nd2,nd3,nd3proc,&
2359 &  max1,max2,max3,m1,m2,m3,md1,md2proc,md3,comm_fft,nproc_fft,me_fft,zf,rho,weight_r,weight_i)
2360 
2361 
2362 !This section has been created automatically by the script Abilint (TD).
2363 !Do not modify the following lines by hand.
2364 #undef ABI_FUNC
2365 #define ABI_FUNC 'sg2002_accrho'
2366 !End of the abilint section
2367 
2368  implicit none
2369 
2370 !Arguments ------------------------------------
2371  integer,intent(in) :: icplexwf,ndat,n1,n2,n3,nd1,nd2,nd3,nd3proc
2372  integer,intent(in) :: max1,max2,max3,m1,m2,m3,md1,md2proc,md3,comm_fft,nproc_fft,me_fft
2373  real(dp),intent(in) :: zf(2,md1,md3,md2proc,ndat)
2374  real(dp),intent(in) :: weight_r(ndat), weight_i(ndat)
2375  real(dp),intent(inout) :: rho(nd1,nd2,nd3)
2376 
2377 !Local variables-------------------------------
2378 !scalars
2379  integer,parameter :: unused0=0
2380  integer :: i,j,i1,ic1,ic2,ic3,idat,ierr,inzee,j3glob
2381  integer :: ioption,j2,j3,j2st,jp2st,lot,lzt,m1zt,ma,mb,n1dfft,nnd3
2382  integer :: m2eff,ncache,n1eff,jeff,includelast
2383 !arrays
2384  real(dp), allocatable :: zmpi1(:,:,:,:),zmpi2(:,:,:,:) ! work arrays for MPI
2385  real(dp), allocatable :: zt(:,:,:)  ! work arrays for transpositions
2386  real(dp), allocatable :: zw(:,:,:) ! cache work array
2387  real(dp) :: tsec(2)
2388 ! FFT work arrays
2389  real(dp), allocatable, dimension(:,:) :: trig1,trig2,trig3
2390  integer, allocatable, dimension(:) :: after1,now1,before1, after2,now2,before2,after3,now3,before3
2391 
2392 ! *************************************************************************
2393 
2394  !ioption=0 ! This was in the old version.
2395  ioption=1 ! This one is needed to be compatible with paral_kgb
2396 
2397  !nproc_fft = xmpi_comm_size(comm_fft); me_fft = xmpi_comm_rank(comm_fft)
2398 
2399 ! find cache size that gives optimal performance on machine
2400  ncache=4*max(n1,n2,n3,1024)
2401  if (ncache/(4*max(n1,n2,n3)) < 1) then
2402     write(std_out,*) &
2403 &     'ncache has to be enlarged to be able to hold at', &
2404 &     'least one 1-d FFT of each size even though this will', &
2405 &     'reduce the performance for shorter transform lengths'
2406     MSG_ERROR("Aborting now")
2407  end if
2408 
2409 !Effective m1 and m2 (complex-to-complex or real-to-complex)
2410  n1eff=n1; m2eff=m2 ; m1zt=n1
2411  if (icplexwf==1) then
2412    n1eff=(n1+1)/2; m2eff=m2/2+1; m1zt=2*(n1/2+1)
2413  end if
2414 
2415  lzt=m2eff
2416  if (mod(m2eff,2) == 0) lzt=lzt+1
2417  if (mod(m2eff,4) == 0) lzt=lzt+1
2418 
2419  ! maximal number of big box 3rd dim slices for all procs
2420  nnd3=nd3proc*nproc_fft
2421 
2422  ABI_ALLOCATE(trig1,(2,n1))
2423  ABI_ALLOCATE(after1,(mdata))
2424  ABI_ALLOCATE(now1,(mdata))
2425  ABI_ALLOCATE(before1,(mdata))
2426  ABI_ALLOCATE(trig2,(2,n2))
2427  ABI_ALLOCATE(after2,(mdata))
2428  ABI_ALLOCATE(now2,(mdata))
2429  ABI_ALLOCATE(before2,(mdata))
2430  ABI_ALLOCATE(trig3,(2,n3))
2431  ABI_ALLOCATE(after3,(mdata))
2432  ABI_ALLOCATE(now3,(mdata))
2433  ABI_ALLOCATE(before3,(mdata))
2434 
2435  ABI_ALLOCATE(zw,(2,ncache/4,2))
2436  ABI_ALLOCATE(zt,(2,lzt,m1zt))
2437  ABI_ALLOCATE(zmpi2,(2,md1,md2proc,nnd3))
2438  if (nproc_fft > 1)  then
2439    ABI_ALLOCATE(zmpi1,(2,md1,md2proc,nnd3))
2440  end if
2441 
2442  call ctrig(n3,trig3,after3,before3,now3,1,ic3)
2443  call ctrig(n1,trig1,after1,before1,now1,1,ic1)
2444  call ctrig(n2,trig2,after2,before2,now2,1,ic2)
2445 
2446  do idat=1,ndat
2447    ! transform along z axis
2448    ! input: I1,I3,J2,(Jp2)
2449    lot=ncache/(4*n3)
2450 
2451    ! Loop over the y planes treated by this node and trasform n1ddft G_z lines.
2452    do j2=1,md2proc
2453      if (me_fft*md2proc+j2 <= m2eff) then ! MG REMOVED TO BE COSISTENT WITH BACK_WF
2454        do i1=1,m1,lot
2455          ma=i1
2456          mb=min(i1+(lot-1),m1)
2457          n1dfft=mb-ma+1
2458 
2459          ! zero-pad n1dfft G_z lines
2460          !  input: G1,G3,G2,(Gp2)
2461          ! output: G1,R3,G2,(Gp2)
2462          call fill_cent(md1,md3,lot,n1dfft,max3,m3,n3,zf(1,i1,1,j2,idat),zw(1,1,1))
2463 
2464          ! Transform along z.
2465          inzee=1
2466          do i=1,ic3
2467            call fftstp(lot,n1dfft,n3,lot,n3,zw(1,1,inzee),zw(1,1,3-inzee), &
2468 &                       trig3,after3(i),now3(i),before3(i),1)
2469            inzee=3-inzee
2470          end do
2471 
2472          ! Local rotation.
2473          ! input:  G1,R3,G2,(Gp2)
2474          ! output: G1,G2,R3,(Gp2)
2475          call scramble(i1,j2,lot,n1dfft,md1,n3,md2proc,nnd3,zw(1,1,inzee),zmpi2)
2476        end do
2477      end if
2478    end do
2479 
2480    ! Interprocessor data transposition
2481    ! input:  G1,G2,R3,Rp3,(Gp2)
2482    ! output: G1,G2,R3,Gp2,(Rp3)
2483    if (nproc_fft > 1) then
2484      call timab(543,1,tsec)
2485      call xmpi_alltoall(zmpi2,2*md1*md2proc*nd3proc, &
2486 &                       zmpi1,2*md1*md2proc*nd3proc,comm_fft,ierr)
2487      call timab(543,2,tsec)
2488    end if
2489 
2490    ! Loop over the z treated by this node.
2491    do j3=1,nd3proc
2492      j3glob = j3 + me_fft*nd3proc
2493      !ABI_CHECK(j3glob <= n3, "j3glob")
2494 
2495      if (me_fft*nd3proc+j3 <= n3) then
2496        Jp2st=1; J2st=1
2497 
2498        lot=ncache/(4*n1)
2499 
2500        ! Loop over G_y in the small box.
2501        do j=1,m2eff,lot
2502          ma=j
2503          mb=min(j+(lot-1),m2eff)
2504          n1dfft=mb-ma+1
2505 
2506          ! Zero-pad input.
2507          ! input:  G1,G2,R3,JG2,(Rp3)
2508          ! output: G2,G1,R3,JG2,(Rp3)
2509 
2510          if (nproc_fft == 1) then
2511           call mpiswitch_cent(j3,n1dfft,Jp2st,J2st,lot,max1,md1,m1,n1,&
2512 &           md2proc,nd3proc,nproc_fft,ioption,zmpi2,zw(1,1,1),unused0, unused0,unused0)
2513          else
2514           call mpiswitch_cent(j3,n1dfft,Jp2st,J2st,lot,max1,md1,m1,n1,&
2515 &           md2proc,nd3proc,nproc_fft,ioption,zmpi1,zw(1,1,1), unused0,unused0,unused0)
2516          end if
2517 
2518          ! Transform along x
2519          ! input:  G2,G1,R3,(Rp3)
2520          ! output: G2,R1,R3,(Rp3)
2521          inzee=1
2522          do i=1,ic1-1
2523            call fftstp(lot,n1dfft,n1,lot,n1,zw(1,1,inzee),zw(1,1,3-inzee), &
2524 &                    trig1,after1(i),now1(i),before1(i),1)
2525            inzee=3-inzee
2526          end do
2527 
2528          i=ic1
2529          call fftstp(lot,n1dfft,n1,lzt,m1zt,zw(1,1,inzee),zt(1,j,1), &
2530 &                    trig1,after1(i),now1(i),before1(i),1)
2531        end do
2532 
2533        ! Transform along y axis (take into account c2c or c2r case).
2534        ! Must loop over the full box.
2535        lot=ncache/(4*n2)
2536        if (icplexwf==1) then
2537          if (mod(lot,2).ne.0) lot=lot-1 ! needed to introduce jeff
2538        end if
2539 
2540        do j=1,n1eff,lot
2541          ma=j
2542          mb=min(j+(lot-1),n1eff)
2543          n1dfft=mb-ma+1
2544          jeff=j
2545          includelast=1
2546 
2547          if (icplexwf==1) then
2548            jeff=2*j-1
2549            includelast=1
2550            if (mb==n1eff .and. n1eff*2/=n1) includelast=0
2551          end if
2552 
2553          ! Zero-pad the input.
2554          ! input:  G2,R1,R3,(Rp3)
2555          ! output: R1,G2,R3,(Rp3)
2556          if (icplexwf==2) then
2557            call switch_cent(n1dfft,max2,m2,n2,lot,n1,lzt,zt(1,1,j),zw(1,1,1))
2558          else
2559            call switchreal_cent(includelast,n1dfft,max2,n2,lot,m1zt,lzt,zt(1,1,jeff),zw(1,1,1))
2560          end if
2561 
2562          inzee=1
2563          do i=1,ic2
2564            call fftstp(lot,n1dfft,n2,lot,n2,zw(1,1,inzee),zw(1,1,3-inzee), &
2565 &                       trig2,after2(i),now2(i),before2(i),1)
2566            inzee=3-inzee
2567          end do
2568 
2569          ! Accumulate
2570          call addrho(icplexwf,includelast,nd1,nd2,n2,lot,n1dfft,&
2571 &          zw(1,1,inzee),rho(jeff,1,j3glob),weight_r(idat),weight_i(idat))
2572        end do
2573        ! output: i1,i2,j3,(jp3)
2574 
2575       end if
2576     end do ! j3
2577  end do ! idat
2578 
2579  ABI_DEALLOCATE(trig1)
2580  ABI_DEALLOCATE(after1)
2581  ABI_DEALLOCATE(now1)
2582  ABI_DEALLOCATE(before1)
2583  ABI_DEALLOCATE(trig2)
2584  ABI_DEALLOCATE(after2)
2585  ABI_DEALLOCATE(now2)
2586  ABI_DEALLOCATE(before2)
2587  ABI_DEALLOCATE(trig3)
2588  ABI_DEALLOCATE(after3)
2589  ABI_DEALLOCATE(now3)
2590  ABI_DEALLOCATE(before3)
2591 
2592  ABI_DEALLOCATE(zmpi2)
2593  ABI_DEALLOCATE(zw)
2594  ABI_DEALLOCATE(zt)
2595  if (nproc_fft > 1)  then
2596    ABI_DEALLOCATE(zmpi1)
2597  end if
2598 
2599 end subroutine sg2002_accrho

m_sg2002/sg2002_applypot [ Functions ]

[ Top ] [ m_sg2002 ] [ Functions ]

NAME

  sg2002_applypot

FUNCTION

 Applies the local real space potential to multiple wavefunctions in Fourier space

INPUTS

   ZF: Wavefunction (input/output) (note the switch of i2 and i3)
        real(F(i1,i3,i2,idat))=ZF(1,i1,i3,i2,idat)
        imag(F(i1,i3,i2,idat))=ZF(2,i1,i3,i2,idat)
   max1 is positive or zero ; m1 >=max1+1
   i1= 1... max1+1 corresponds to positive and zero wavevectors 0 ... max1
   then, if m1 > max1+1, one has min1=max1-m1+1 and
   i1= max1+2 ... m1 corresponds to negative wavevectors min1 ... -1
   i2 and i3 have a similar definition of range
   idat=1,ndat
   md1,md2,md3: Dimension of ZF (input as well as output), distributed on different procs
   md2proc=((md2-1)/nproc_fft)+1  maximal number of small box 2nd dim slices for one proc

   POT: Potential
        POT(cplex*i1,i2,i3)
        cplex=1 or 2 ,  i1=1,n1 , i2=1,n2 , i3=1,n3
   nd1,nd2,nd3: dimension of pot
   comm_fft: MPI communicator
   nproc_fft: number of processors used as returned by MPI_COMM_SIZE
   me_fft: [0:nproc_fft-1] number of processor as returned by MPI_COMM_RANK
    n1,n2,n3: logical dimension of the transform. As transform lengths
              most products of the prime factors 2,3,5 are allowed.
             The detailed table with allowed transform lengths can
             be found in subroutine CTRIG

 NOTES:
   PERFORMANCE CONSIDERATIONS:
   The maximum number of processors that can reasonably be used is max(n2/2,n3/2)

   It is very important to find the optimal
   value of NCACHE. NCACHE determines the size of the work array ZW, that
   has to fit into cache. It has therefore to be chosen to equal roughly
    half the size of the physical cache in units of real*8 numbers.
   The optimal value of ncache can easily be determined by numerical
   experimentation. A too large value of ncache leads to a dramatic
   and sudden decrease of performance, a too small value to a to a
   slow and less dramatic decrease of performance. If NCACHE is set
   to a value so small, that not even a single one dimensional transform
   can be done in the workarray zw, the program stops with an error message.

PARENTS

      m_fft

CHILDREN

SOURCE

1489 subroutine sg2002_applypot(icplexwf,cplex,ndat,n1,n2,n3,nd1,nd2,nd3,nd3proc,&
1490 &  max1i,max2i,max3i,m1i,m2i,m3i,md1,md2proc,md3,&
1491 &  max1o,max2o,max3o,m1o,m2o,m3o,comm_fft,nproc_fft,me_fft,pot,zf)
1492 
1493 
1494 !This section has been created automatically by the script Abilint (TD).
1495 !Do not modify the following lines by hand.
1496 #undef ABI_FUNC
1497 #define ABI_FUNC 'sg2002_applypot'
1498 !End of the abilint section
1499 
1500  implicit none
1501 
1502 !Arguments ------------------------------------
1503  integer,intent(in) :: icplexwf,cplex,ndat,n1,n2,n3,nd1,nd2,nd3,nd3proc
1504  integer,intent(in) :: max1i,max2i,max3i,m1i,m2i,m3i,md1,md2proc,md3
1505  integer,intent(in) :: max1o,max2o,max3o,m1o,m2o,m3o,comm_fft,nproc_fft,me_fft
1506  real(dp),intent(in) :: pot(cplex*nd1,nd2,nd3)
1507  real(dp),intent(inout) :: zf(2,md1,md3,md2proc,ndat)
1508 
1509 !Local variables-------------------------------
1510 !scalars
1511  integer,parameter :: unused0=0
1512  integer :: i,j,i1,i2,i3,ic1,ic2,ic3,idat,ierr,inzee,j3glob
1513  integer :: ioption,j2,j3,lot,lzt,m1zt,ma,mb,n1dfft,nnd3
1514  integer :: m2eff,ncache,n1eff,i1inv,i2inv,i3inv,jeff,includelast,j2stb
1515  integer :: jx,j2stf,Jp2stb,Jp2stf,m2ieff,m2oeff
1516 !arrays
1517  real(dp) :: tsec(2)
1518  real(dp), allocatable :: zt(:,:,:) ! work arrays for transpositions
1519  real(dp), allocatable :: zmpi1(:,:,:,:),zmpi2(:,:,:,:) ! work arrays for MPI
1520  real(dp), allocatable :: zw(:,:,:) ! cache work array
1521 ! FFT work arrays
1522  real(dp), allocatable, dimension(:,:) :: btrig1,btrig2,btrig3
1523  real(dp), allocatable, dimension(:,:) :: ftrig1,ftrig2,ftrig3
1524  integer, allocatable, dimension(:) :: after1,now1,before1,after2,now2,before2,after3,now3,before3
1525 
1526 ! *************************************************************************
1527 
1528  !ioption=0 ! This was in the old version.
1529  ioption=1 ! This one is needed to be compatible with paral_kgb
1530 
1531  ncache=4*max(n1,n2,n3,1024)
1532  if (ncache/(4*max(n1,n2,n3)) < 1) then
1533    write(std_out,*) &
1534 &    'ncache has to be enlarged to be able to hold at', &
1535 &    'least one 1-d FFT of each size even though this will', &
1536 &    'reduce the performance for shorter transform lengths'
1537    MSG_ERROR("Aborting now")
1538  end if
1539 
1540  ! Effective m1 and m2 (complex-to-complex or real-to-complex)
1541  n1eff=n1; m2ieff=m2i; m2oeff=m2o; m1zt=n1
1542  if (icplexwf==1) then
1543    n1eff=(n1+1)/2; m2ieff=m2i/2+1; m2oeff=m2o/2+1; m1zt=2*(n1/2+1)
1544  end if
1545 
1546  m2eff=max(m2ieff,m2oeff)
1547  lzt=m2eff
1548  if (mod(m2eff,2) == 0) lzt=lzt+1
1549  if (mod(m2eff,4) == 0) lzt=lzt+1
1550 
1551  ! maximal number of big box 3rd dim slices for all procs
1552  nnd3=nd3proc*nproc_fft
1553 
1554  ABI_ALLOCATE(btrig1,(2,n1))
1555  ABI_ALLOCATE(ftrig1,(2,n1))
1556  ABI_ALLOCATE(after1,(mdata))
1557  ABI_ALLOCATE(now1,(mdata))
1558  ABI_ALLOCATE(before1,(mdata))
1559  ABI_ALLOCATE(btrig2,(2,n2))
1560  ABI_ALLOCATE(ftrig2,(2,n2))
1561  ABI_ALLOCATE(after2,(mdata))
1562  ABI_ALLOCATE(now2,(mdata))
1563  ABI_ALLOCATE(before2,(mdata))
1564  ABI_ALLOCATE(btrig3,(2,n3))
1565  ABI_ALLOCATE(ftrig3,(2,n3))
1566  ABI_ALLOCATE(after3,(mdata))
1567  ABI_ALLOCATE(now3,(mdata))
1568  ABI_ALLOCATE(before3,(mdata))
1569 
1570  ABI_ALLOCATE(zw,(2,ncache/4,2))
1571  ABI_ALLOCATE(zt,(2,lzt,m1zt))
1572  ABI_ALLOCATE(zmpi2,(2,md1,md2proc,nnd3))
1573  if (nproc_fft > 1)  then
1574    ABI_ALLOCATE(zmpi1,(2,md1,md2proc,nnd3))
1575  end if
1576 
1577  call ctrig(n3,btrig3,after3,before3,now3,1,ic3)
1578  call ctrig(n1,btrig1,after1,before1,now1,1,ic1)
1579  call ctrig(n2,btrig2,after2,before2,now2,1,ic2)
1580 
1581  do j=1,n1
1582    ftrig1(1,j)= btrig1(1,j)
1583    ftrig1(2,j)=-btrig1(2,j)
1584  end do
1585  do j=1,n2
1586    ftrig2(1,j)= btrig2(1,j)
1587    ftrig2(2,j)=-btrig2(2,j)
1588  end do
1589  do j=1,n3
1590    ftrig3(1,j)= btrig3(1,j)
1591    ftrig3(2,j)=-btrig3(2,j)
1592  end do
1593 
1594  do idat=1,ndat
1595    !
1596    ! transform along z axis
1597    ! input: G1,G3,G2,(Gp2)
1598    lot=ncache/(4*n3)
1599    do j2=1,md2proc
1600      if (me_fft*md2proc+j2 <= m2ieff) then
1601        do i1=1,m1i,lot
1602          ma=i1
1603          mb=min(i1+(lot-1),m1i)
1604          n1dfft=mb-ma+1
1605 
1606          ! zero-pad n1dfft G_z lines
1607          ! input: G1,G3,G2,(Gp2)
1608          call fill_cent(md1,md3,lot,n1dfft,max3i,m3i,n3,zf(1,i1,1,j2,idat),zw(1,1,1))
1609 
1610          inzee=1
1611          do i=1,ic3
1612            call fftstp(lot,n1dfft,n3,lot,n3,zw(1,1,inzee),zw(1,1,3-inzee), &
1613 &                      btrig3,after3(i),now3(i),before3(i),1)
1614            inzee=3-inzee
1615          end do
1616 
1617          ! Local rotation.
1618          ! input:  G1,R3,G2,(Gp2)
1619          ! output: G1,G2,R3,(Gp2)
1620          call scramble(i1,j2,lot,n1dfft,md1,n3,md2proc,nnd3,zw(1,1,inzee),zmpi2)
1621        end do
1622      end if
1623    end do
1624 
1625    ! Interprocessor data transposition
1626    ! input:  G1,G2,R3,Rp3,(Gp2)
1627    ! output: G1,G2,R3,Gp2,(Rp3)
1628    if (nproc_fft > 1) then
1629       call timab(543,1,tsec)
1630       call xmpi_alltoall(zmpi2,2*md1*md2proc*nd3proc,&
1631 &                        zmpi1,2*md1*md2proc*nd3proc,comm_fft,ierr)
1632       call timab(543,2,tsec)
1633    end if
1634 
1635    do j3=1,nd3proc
1636      j3glob = j3 + me_fft*nd3proc
1637 
1638      if (me_fft*nd3proc+j3 <= n3) then
1639        Jp2stb=1; J2stb=1
1640        Jp2stf=1; J2stf=1
1641 
1642        ! transform along x axis
1643        lot=ncache/(4*n1)
1644 
1645        do j=1,m2ieff,lot
1646          ma=j
1647          mb=min(j+(lot-1),m2ieff)
1648          n1dfft=mb-ma+1
1649 
1650          ! Zero-pad input.
1651          ! input:  G1,G2,R3,G2,(Rp3)
1652          ! output: G2,G1,R3,G2,(Rp3)
1653          if (nproc_fft == 1) then
1654            call mpiswitch_cent(j3,n1dfft,Jp2stb,J2stb,lot,max1i,md1,m1i,n1,&
1655 &           md2proc,nd3proc,nproc_fft,ioption,zmpi2,zw(1,1,1), unused0, unused0, unused0)
1656          else
1657            call mpiswitch_cent(j3,n1dfft,Jp2stb,J2stb,lot,max1i,md1,m1i,n1,&
1658 &           md2proc,nd3proc,nproc_fft,ioption,zmpi1,zw(1,1,1), unused0, unused0, unused0)
1659          end if
1660 
1661          ! Transform along x
1662          ! input:  G2,G1,R3,(Rp3)
1663          ! output: G2,R1,R3,(Rp3)
1664          inzee=1
1665          do i=1,ic1-1
1666            call fftstp(lot,n1dfft,n1,lot,n1,zw(1,1,inzee),zw(1,1,3-inzee), &
1667 &                     btrig1,after1(i),now1(i),before1(i),1)
1668            inzee=3-inzee
1669          end do
1670 
1671          i=ic1
1672          call fftstp(lot,n1dfft,n1,lzt,m1zt,zw(1,1,inzee),zt(1,j,1), &
1673 &                    btrig1,after1(i),now1(i),before1(i),1)
1674        end do
1675 
1676        ! Transform along y axis (take into account c2c or c2r case).
1677        ! Must loop over the full box.
1678        lot=ncache/(4*n2)
1679 
1680        if (icplexwf==1) then
1681          if(mod(lot,2).ne.0)lot=lot-1 ! needed to introduce jeff
1682        end if
1683 
1684        do j=1,n1eff,lot
1685          ma=j
1686          mb=min(j+(lot-1),n1eff)
1687          n1dfft=mb-ma+1
1688          jeff=j
1689          includelast=1
1690 
1691          if (icplexwf==1) then
1692            jeff=2*j-1
1693            includelast=1
1694            if (mb==n1eff .and. n1eff*2/=n1) includelast=0
1695          end if
1696 
1697          ! Zero-pad the input.
1698          !  input: G2,R1,R3,(Rp3)
1699          ! output: R1,G2,R3,(Rp3)
1700          if (icplexwf==2) then
1701            call switch_cent(n1dfft,max2i,m2i,n2,lot,n1,lzt,zt(1,1,jeff),zw(1,1,1))
1702          else
1703            call switchreal_cent(includelast,n1dfft,max2i,n2,lot,m1zt,lzt,zt(1,1,jeff),zw(1,1,1))
1704          end if
1705 
1706          ! input:  R1,G2,R3,(Rp3)
1707          ! output: R1,R2,R3,(Rp3)
1708          inzee=1
1709          do i=1,ic2
1710            call fftstp(lot,n1dfft,n2,lot,n2,zw(1,1,inzee),zw(1,1,3-inzee), &
1711 &                       btrig2,after2(i),now2(i),before2(i),1)
1712             inzee=3-inzee
1713          end do
1714          ! output: R1,R2,R3,(Rp3)
1715 
1716          ! Multiply with potential in real space
1717          jx=cplex*(jeff-1)+1
1718          call multpot(icplexwf,cplex,includelast,nd1,nd2,n2,lot,n1dfft,pot(jx,1,j3glob),zw(1,1,inzee))
1719 
1720          ! TRANSFORM BACK IN FOURIER SPACE
1721          ! transform along y axis
1722          ! input: R1,R2,R3,(Rp3)
1723          do i=1,ic2
1724            call fftstp(lot,n1dfft,n2,lot,n2,zw(1,1,inzee),zw(1,1,3-inzee), &
1725 &                       ftrig2,after2(i),now2(i),before2(i),-1)
1726            inzee=3-inzee
1727          end do
1728 
1729          !  input: R1,G2,R3,(Rp3)
1730          ! output: G2,R1,R3,(Rp3)
1731          if (icplexwf==2) then
1732            call unswitch_cent(n1dfft,max2o,m2o,n2,lot,n1,lzt,zw(1,1,inzee),zt(1,1,jeff))
1733          else
1734            call unswitchreal_cent(n1dfft,max2o,n2,lot,n1,lzt,zw(1,1,inzee),zt(1,1,jeff))
1735          end if
1736 
1737        end do ! j
1738 
1739        ! transform along x axis
1740        ! input:  R2,R1,R3,(Rp3)
1741        ! output: R2,G1,R3,(Rp3)
1742        lot=ncache/(4*n1)
1743 
1744        do j=1,m2oeff,lot
1745          ma=j
1746          mb=min(j+(lot-1),m2oeff)
1747          n1dfft=mb-ma+1
1748          i=1
1749          call fftstp(lzt,n1dfft,m1zt,lot,n1,zt(1,j,1),zw(1,1,1), &
1750 &                    ftrig1,after1(i),now1(i),before1(i),-1)
1751 
1752          inzee=1
1753          do i=2,ic1
1754            call fftstp(lot,n1dfft,n1,lot,n1,zw(1,1,inzee),zw(1,1,3-inzee), &
1755 &                       ftrig1,after1(i),now1(i),before1(i),-1)
1756            inzee=3-inzee
1757          end do
1758 
1759          ! input:  G2,G1,R3,Gp2,(Rp3)
1760          ! output: G1,G2,R3,Gp2,(Rp3)
1761          if (nproc_fft == 1) then
1762            call unmpiswitch_cent(j3,n1dfft,Jp2stf,J2stf,lot,max1o,md1,m1o,n1,&
1763 &           md2proc,nd3proc,nproc_fft,ioption,zw(1,1,inzee),zmpi2)
1764          else
1765            call unmpiswitch_cent(j3,n1dfft,Jp2stf,J2stf,lot,max1o,md1,m1o,n1,&
1766 &           md2proc,nd3proc,nproc_fft,ioption,zw(1,1,inzee),zmpi1)
1767          end if
1768        end do ! j
1769      end if
1770    end do
1771 
1772    ! Interprocessor data transposition
1773    ! input:  G1,G2,R3,Gp2,(Rp3)
1774    ! output: G1,G2,R3,Rp3,(Gp2)
1775    if (nproc_fft > 1) then
1776      call timab(544,1,tsec)
1777      call xmpi_alltoall(zmpi1,2*md1*md2proc*nd3proc, &
1778 &                       zmpi2,2*md1*md2proc*nd3proc,comm_fft,ierr)
1779      call timab(544,2,tsec)
1780    end if
1781 
1782    ! transform along z axis
1783    ! input: G1,G2,R3,(Gp2)
1784    lot=ncache/(4*n3)
1785    do j2=1,md2proc
1786      if (me_fft*md2proc+j2 <= m2oeff) then
1787        do i1=1,m1o,lot
1788          ma=i1
1789          mb=min(i1+(lot-1),m1o)
1790          n1dfft=mb-ma+1
1791 
1792          ! input:  G1,G2,R3,(Gp2)
1793          ! output: G1,R3,G2,(Gp2)
1794          call unscramble(i1,j2,lot,n1dfft,md1,n3,md2proc,nnd3,zmpi2,zw(1,1,1))
1795 
1796          inzee=1
1797          do i=1,ic3
1798            call fftstp(lot,n1dfft,n3,lot,n3,zw(1,1,inzee),zw(1,1,3-inzee), &
1799 &           ftrig3,after3(i),now3(i),before3(i),-1)
1800            inzee=3-inzee
1801          end do
1802 
1803          call unfill_cent(md1,md3,lot,n1dfft,max3o,m3o,n3,zw(1,1,inzee),zf(1,i1,1,j2,idat))
1804          ! output: G1,G3,G2,(Gp2)
1805        end do
1806      end if
1807    end do
1808 
1809    ! Complete missing values with complex conjugate
1810    ! Inverse of ix is located at nx+2-ix , except for ix=1, for which it is 1.
1811    if (icplexwf==1) then
1812      do i3=1,m3o
1813        i3inv=m3o+2-i3
1814        if (i3==1) i3inv=1
1815        if (m2oeff>1)then
1816          do i2=2,m2oeff
1817            i2inv=m2o+2-i2
1818            zf(1,1,i3inv,i2inv,idat)= zf(1,1,i3,i2,idat)
1819            zf(2,1,i3inv,i2inv,idat)=-zf(2,1,i3,i2,idat)
1820            do i1=2,m1o
1821              i1inv=m1o+2-i1
1822              zf(1,i1inv,i3inv,i2inv,idat)= zf(1,i1,i3,i2,idat)
1823              zf(2,i1inv,i3inv,i2inv,idat)=-zf(2,i1,i3,i2,idat)
1824            end do
1825          end do
1826        end if
1827      end do
1828    end if
1829 
1830  end do ! idat
1831 
1832  ABI_DEALLOCATE(btrig1)
1833  ABI_DEALLOCATE(ftrig1)
1834  ABI_DEALLOCATE(after1)
1835  ABI_DEALLOCATE(now1)
1836  ABI_DEALLOCATE(before1)
1837  ABI_DEALLOCATE(btrig2)
1838  ABI_DEALLOCATE(ftrig2)
1839  ABI_DEALLOCATE(after2)
1840  ABI_DEALLOCATE(now2)
1841  ABI_DEALLOCATE(before2)
1842  ABI_DEALLOCATE(btrig3)
1843  ABI_DEALLOCATE(ftrig3)
1844  ABI_DEALLOCATE(after3)
1845  ABI_DEALLOCATE(now3)
1846  ABI_DEALLOCATE(before3)
1847 
1848  ABI_DEALLOCATE(zmpi2)
1849  ABI_DEALLOCATE(zw)
1850  ABI_DEALLOCATE(zt)
1851  if (nproc_fft > 1)  then
1852    ABI_DEALLOCATE(zmpi1)
1853  end if
1854 
1855 end subroutine sg2002_applypot

m_sg2002/sg2002_applypot_many [ Functions ]

[ Top ] [ m_sg2002 ] [ Functions ]

NAME

  sg2002_applypot_many

FUNCTION

 Applies the local real space potential to multiple wavefunctions in Fourier space

INPUTS

   ZF: Wavefunction (input/output) (note the switch of i2 and i3)
        real(F(i1,i3,i2,idat))=ZF(1,i1,i3,i2,idat)
        imag(F(i1,i3,i2,idat))=ZF(2,i1,i3,i2,idat)
   max1 is positive or zero ; m1 >=max1+1
   i1= 1... max1+1 corresponds to positive and zero wavevectors 0 ... max1
   then, if m1 > max1+1, one has min1=max1-m1+1 and
   i1= max1+2 ... m1 corresponds to negative wavevectors min1 ... -1
   i2 and i3 have a similar definition of range
   idat=1,ndat
   md1,md2,md3: Dimension of ZF (input as well as output), distributed on different procs
   md2proc=((md2-1)/nproc_fft)+1  maximal number of small box 2nd dim slices for one proc

   POT: Potential
        POT(cplex*i1,i2,i3)
        cplex=1 or 2 ,  i1=1,n1 , i2=1,n2 , i3=1,n3
   nd1,nd2,nd3: dimension of pot
   comm_fft: MPI communicator
   nproc_fft: number of processors used as returned by MPI_COMM_SIZE
   me_fft: [0:nproc_fft-1] number of processor as returned by MPI_COMM_RANK
    n1,n2,n3: logical dimension of the transform. As transform lengths
              most products of the prime factors 2,3,5 are allowed.
             The detailed table with allowed transform lengths can
             be found in subroutine CTRIG

 NOTES:
   PERFORMANCE CONSIDERATIONS:
   The maximum number of processors that can reasonably be used is max(n2/2,n3/2)

   It is very important to find the optimal
   value of NCACHE. NCACHE determines the size of the work array ZW, that
   has to fit into cache. It has therefore to be chosen to equal roughly
    half the size of the physical cache in units of real*8 numbers.
   The optimal value of ncache can easily be determined by numerical
   experimentation. A too large value of ncache leads to a dramatic
   and sudden decrease of performance, a too small value to a to a
   slow and less dramatic decrease of performance. If NCACHE is set
   to a value so small, that not even a single one dimensional transform
   can be done in the workarray zw, the program stops with an error message.

PARENTS

      m_fft

CHILDREN

SOURCE

1915 subroutine sg2002_applypot_many(icplexwf,cplex,ndat,n1,n2,n3,nd1,nd2,nd3,nd3proc,&
1916 &  max1i,max2i,max3i,m1i,m2i,m3i,md1,md2proc,md3,&
1917 &  max1o,max2o,max3o,m1o,m2o,m3o,comm_fft,nproc_fft,me_fft,pot,zf)
1918 
1919 
1920 !This section has been created automatically by the script Abilint (TD).
1921 !Do not modify the following lines by hand.
1922 #undef ABI_FUNC
1923 #define ABI_FUNC 'sg2002_applypot_many'
1924 !End of the abilint section
1925 
1926  implicit none
1927 
1928 !Arguments ------------------------------------
1929  integer,intent(in) :: icplexwf,cplex,ndat,n1,n2,n3,nd1,nd2,nd3,nd3proc
1930  integer,intent(in) :: max1i,max2i,max3i,m1i,m2i,m3i,md1,md2proc,md3
1931  integer,intent(in) :: max1o,max2o,max3o,m1o,m2o,m3o,comm_fft,nproc_fft,me_fft
1932  real(dp),intent(in) :: pot(cplex*nd1,nd2,nd3)
1933  real(dp),intent(inout) :: zf(2,md1,md3,md2proc,ndat)
1934 
1935 !Local variables-------------------------------
1936 !scalars
1937  integer,parameter :: unused0=0
1938  integer :: i,j,i1,i2,i3,ic1,ic2,ic3,idat,ierr,inzee,j3glob
1939  integer :: ioption,j2,j3,lot,lzt,m1zt,ma,mb,n1dfft,nnd3
1940  integer :: m2eff,ncache,n1eff,i1inv,i2inv,i3inv,jeff,includelast,j2stb
1941  integer :: jx,j2stf,Jp2stb,Jp2stf,m2ieff,m2oeff
1942 !arrays
1943  integer :: requests(ndat)
1944  real(dp) :: tsec(2)
1945  real(dp), allocatable :: zt(:,:,:) ! work arrays for transpositions
1946  real(dp), allocatable :: zmpi1(:,:,:,:,:),zmpi2(:,:,:,:,:) ! work arrays for MPI
1947  real(dp), allocatable :: zw(:,:,:) ! cache work array
1948 ! FFT work arrays
1949  real(dp), allocatable, dimension(:,:) :: btrig1,btrig2,btrig3
1950  real(dp), allocatable, dimension(:,:) :: ftrig1,ftrig2,ftrig3
1951  integer, allocatable, dimension(:) :: after1,now1,before1,after2,now2,before2,after3,now3,before3
1952 
1953 ! *************************************************************************
1954 
1955  !ioption=0 ! This was in the old version.
1956  ioption=1 ! This one is needed to be compatible with paral_kgb
1957 
1958  ! call timab(541,1,tsec)
1959  ncache=4*max(n1,n2,n3,1024)
1960  if (ncache/(4*max(n1,n2,n3)) < 1) then
1961    write(std_out,*) &
1962 &    'ncache has to be enlarged to be able to hold at', &
1963 &    'least one 1-d FFT of each size even though this will', &
1964 &    'reduce the performance for shorter transform lengths'
1965    MSG_ERROR("Aborting now")
1966  end if
1967 
1968  ! Effective m1 and m2 (complex-to-complex or real-to-complex)
1969  n1eff=n1; m2ieff=m2i; m2oeff=m2o; m1zt=n1
1970  if (icplexwf==1) then
1971    n1eff=(n1+1)/2; m2ieff=m2i/2+1; m2oeff=m2o/2+1; m1zt=2*(n1/2+1)
1972  end if
1973 
1974  m2eff=max(m2ieff,m2oeff)
1975  lzt=m2eff
1976  if (mod(m2eff,2) == 0) lzt=lzt+1
1977  if (mod(m2eff,4) == 0) lzt=lzt+1
1978 
1979  ! maximal number of big box 3rd dim slices for all procs
1980  nnd3=nd3proc*nproc_fft
1981 
1982  ABI_ALLOCATE(btrig1,(2,n1))
1983  ABI_ALLOCATE(ftrig1,(2,n1))
1984  ABI_ALLOCATE(after1,(mdata))
1985  ABI_ALLOCATE(now1,(mdata))
1986  ABI_ALLOCATE(before1,(mdata))
1987  ABI_ALLOCATE(btrig2,(2,n2))
1988  ABI_ALLOCATE(ftrig2,(2,n2))
1989  ABI_ALLOCATE(after2,(mdata))
1990  ABI_ALLOCATE(now2,(mdata))
1991  ABI_ALLOCATE(before2,(mdata))
1992  ABI_ALLOCATE(btrig3,(2,n3))
1993  ABI_ALLOCATE(ftrig3,(2,n3))
1994  ABI_ALLOCATE(after3,(mdata))
1995  ABI_ALLOCATE(now3,(mdata))
1996  ABI_ALLOCATE(before3,(mdata))
1997 
1998  ABI_ALLOCATE(zw,(2,ncache/4,2))
1999  ABI_ALLOCATE(zt,(2,lzt,m1zt))
2000  ABI_ALLOCATE(zmpi2,(2,md1,md2proc,nnd3,ndat))
2001  if (nproc_fft > 1)  then
2002    ABI_ALLOCATE(zmpi1,(2,md1,md2proc,nnd3,ndat))
2003  end if
2004 
2005  call ctrig(n3,btrig3,after3,before3,now3,1,ic3)
2006  call ctrig(n1,btrig1,after1,before1,now1,1,ic1)
2007  call ctrig(n2,btrig2,after2,before2,now2,1,ic2)
2008 
2009  do j=1,n1
2010    ftrig1(1,j)= btrig1(1,j)
2011    ftrig1(2,j)=-btrig1(2,j)
2012  end do
2013  do j=1,n2
2014    ftrig2(1,j)= btrig2(1,j)
2015    ftrig2(2,j)=-btrig2(2,j)
2016  end do
2017  do j=1,n3
2018    ftrig3(1,j)= btrig3(1,j)
2019    ftrig3(2,j)=-btrig3(2,j)
2020  end do
2021 
2022  ! Here we take advantage of non-blocking IALLTOALL:
2023  ! Perform the first step of MPI-FFT for ndat wavefunctions.
2024  do idat=1,ndat
2025 
2026    !
2027    ! transform along z axis
2028    ! input: G1,G3,G2,(Gp2)
2029    lot=ncache/(4*n3)
2030    do j2=1,md2proc
2031      if (me_fft*md2proc+j2 <= m2ieff) then
2032        do i1=1,m1i,lot
2033          ma=i1
2034          mb=min(i1+(lot-1),m1i)
2035          n1dfft=mb-ma+1
2036 
2037          ! zero-pad n1dfft G_z lines
2038          ! input: G1,G3,G2,(Gp2)
2039          call fill_cent(md1,md3,lot,n1dfft,max3i,m3i,n3,zf(1,i1,1,j2,idat),zw(1,1,1))
2040 
2041          inzee=1
2042          do i=1,ic3
2043            call fftstp(lot,n1dfft,n3,lot,n3,zw(1,1,inzee),zw(1,1,3-inzee), &
2044 &                      btrig3,after3(i),now3(i),before3(i),1)
2045            inzee=3-inzee
2046          end do
2047 
2048          ! Local rotation.
2049          ! input:  G1,R3,G2,(Gp2)
2050          ! output: G1,G2,R3,(Gp2)
2051          call scramble(i1,j2,lot,n1dfft,md1,n3,md2proc,nnd3,zw(1,1,inzee),zmpi2(:,:,:,:,idat))
2052        end do
2053      end if
2054    end do
2055 
2056    ! Interprocessor data transposition
2057    ! input:  G1,G2,R3,Rp3,(Gp2)
2058    ! output: G1,G2,R3,Gp2,(Rp3)
2059    if (nproc_fft > 1) then
2060       call timab(543,1,tsec)
2061       call xmpi_ialltoall(zmpi2(:,:,:,:,idat),2*md1*md2proc*nd3proc,&
2062 &                         zmpi1(:,:,:,:,idat),2*md1*md2proc*nd3proc,comm_fft,requests(idat))
2063       call timab(543,2,tsec)
2064    end if
2065  end do ! idat
2066 
2067  ! The second step of MPI-FFT
2068  do idat=1,ndat
2069     ! Make sure communication is completed.
2070     if (nproc_fft>1) call xmpi_wait(requests(idat),ierr)
2071 
2072    do j3=1,nd3proc
2073      j3glob = j3 + me_fft*nd3proc
2074 
2075      if (me_fft*nd3proc+j3 <= n3) then
2076        Jp2stb=1; J2stb=1
2077        Jp2stf=1; J2stf=1
2078 
2079        ! transform along x axis
2080        lot=ncache/(4*n1)
2081 
2082        do j=1,m2ieff,lot
2083          ma=j
2084          mb=min(j+(lot-1),m2ieff)
2085          n1dfft=mb-ma+1
2086 
2087          ! Zero-pad input.
2088          ! input:  G1,G2,R3,G2,(Rp3)
2089          ! output: G2,G1,R3,G2,(Rp3)
2090          if (nproc_fft == 1) then
2091            call mpiswitch_cent(j3,n1dfft,Jp2stb,J2stb,lot,max1i,md1,m1i,n1,&
2092 &           md2proc,nd3proc,nproc_fft,ioption,zmpi2(:,:,:,:,idat),zw(1,1,1), unused0, unused0, unused0)
2093          else
2094            call mpiswitch_cent(j3,n1dfft,Jp2stb,J2stb,lot,max1i,md1,m1i,n1,&
2095 &           md2proc,nd3proc,nproc_fft,ioption,zmpi1(:,:,:,:,idat),zw(1,1,1), unused0, unused0, unused0)
2096          end if
2097 
2098          ! Transform along x
2099          ! input:  G2,G1,R3,(Rp3)
2100          ! output: G2,R1,R3,(Rp3)
2101          inzee=1
2102          do i=1,ic1-1
2103            call fftstp(lot,n1dfft,n1,lot,n1,zw(1,1,inzee),zw(1,1,3-inzee), &
2104 &                     btrig1,after1(i),now1(i),before1(i),1)
2105            inzee=3-inzee
2106          end do
2107 
2108          i=ic1
2109          call fftstp(lot,n1dfft,n1,lzt,m1zt,zw(1,1,inzee),zt(1,j,1), &
2110 &                    btrig1,after1(i),now1(i),before1(i),1)
2111        end do
2112 
2113        ! Transform along y axis (take into account c2c or c2r case).
2114        ! Must loop over the full box.
2115        lot=ncache/(4*n2)
2116 
2117        if (icplexwf==1) then
2118          if(mod(lot,2).ne.0)lot=lot-1 ! needed to introduce jeff
2119        end if
2120 
2121        do j=1,n1eff,lot
2122          ma=j
2123          mb=min(j+(lot-1),n1eff)
2124          n1dfft=mb-ma+1
2125          jeff=j
2126          includelast=1
2127 
2128          if (icplexwf==1) then
2129            jeff=2*j-1
2130            includelast=1
2131            if (mb==n1eff .and. n1eff*2/=n1) includelast=0
2132          end if
2133 
2134          ! Zero-pad the input.
2135          !  input: G2,R1,R3,(Rp3)
2136          ! output: R1,G2,R3,(Rp3)
2137          if (icplexwf==2) then
2138            call switch_cent(n1dfft,max2i,m2i,n2,lot,n1,lzt,zt(1,1,jeff),zw(1,1,1))
2139          else
2140            call switchreal_cent(includelast,n1dfft,max2i,n2,lot,m1zt,lzt,zt(1,1,jeff),zw(1,1,1))
2141          end if
2142 
2143          ! input:  R1,G2,R3,(Rp3)
2144          ! output: R1,R2,R3,(Rp3)
2145          inzee=1
2146          do i=1,ic2
2147            call fftstp(lot,n1dfft,n2,lot,n2,zw(1,1,inzee),zw(1,1,3-inzee), &
2148 &                       btrig2,after2(i),now2(i),before2(i),1)
2149             inzee=3-inzee
2150          end do
2151          ! output: R1,R2,R3,(Rp3)
2152 
2153          ! Multiply with potential in real space
2154          jx=cplex*(jeff-1)+1
2155          call multpot(icplexwf,cplex,includelast,nd1,nd2,n2,lot,n1dfft,pot(jx,1,j3glob),zw(1,1,inzee))
2156 
2157          ! TRANSFORM BACK IN FOURIER SPACE
2158          ! transform along y axis
2159          ! input: R1,R2,R3,(Rp3)
2160          do i=1,ic2
2161            call fftstp(lot,n1dfft,n2,lot,n2,zw(1,1,inzee),zw(1,1,3-inzee), &
2162 &                       ftrig2,after2(i),now2(i),before2(i),-1)
2163            inzee=3-inzee
2164          end do
2165 
2166          !  input: R1,G2,R3,(Rp3)
2167          ! output: G2,R1,R3,(Rp3)
2168          if (icplexwf==2) then
2169            call unswitch_cent(n1dfft,max2o,m2o,n2,lot,n1,lzt,zw(1,1,inzee),zt(1,1,jeff))
2170          else
2171            call unswitchreal_cent(n1dfft,max2o,n2,lot,n1,lzt,zw(1,1,inzee),zt(1,1,jeff))
2172          end if
2173 
2174        end do ! j
2175 
2176        ! transform along x axis
2177        ! input:  R2,R1,R3,(Rp3)
2178        ! output: R2,G1,R3,(Rp3)
2179        lot=ncache/(4*n1)
2180 
2181        do j=1,m2oeff,lot
2182          ma=j
2183          mb=min(j+(lot-1),m2oeff)
2184          n1dfft=mb-ma+1
2185          i=1
2186          call fftstp(lzt,n1dfft,m1zt,lot,n1,zt(1,j,1),zw(1,1,1), &
2187 &                    ftrig1,after1(i),now1(i),before1(i),-1)
2188 
2189          inzee=1
2190          do i=2,ic1
2191            call fftstp(lot,n1dfft,n1,lot,n1,zw(1,1,inzee),zw(1,1,3-inzee), &
2192 &                       ftrig1,after1(i),now1(i),before1(i),-1)
2193            inzee=3-inzee
2194          end do
2195 
2196          ! input:  G2,G1,R3,Gp2,(Rp3)
2197          ! output: G1,G2,R3,Gp2,(Rp3)
2198          if (nproc_fft == 1) then
2199            call unmpiswitch_cent(j3,n1dfft,Jp2stf,J2stf,lot,max1o,md1,m1o,n1,&
2200 &           md2proc,nd3proc,nproc_fft,ioption,zw(1,1,inzee),zmpi2(:,:,:,:,idat))
2201          else
2202            call unmpiswitch_cent(j3,n1dfft,Jp2stf,J2stf,lot,max1o,md1,m1o,n1,&
2203 &           md2proc,nd3proc,nproc_fft,ioption,zw(1,1,inzee),zmpi1(:,:,:,:,idat))
2204          end if
2205        end do ! j
2206      end if
2207    end do
2208 
2209    ! Interprocessor data transposition
2210    ! input:  G1,G2,R3,Gp2,(Rp3)
2211    ! output: G1,G2,R3,Rp3,(Gp2)
2212    if (nproc_fft > 1) then
2213      call timab(544,1,tsec)
2214      call xmpi_ialltoall(zmpi1(:,:,:,:,idat),2*md1*md2proc*nd3proc, &
2215 &                        zmpi2(:,:,:,:,idat),2*md1*md2proc*nd3proc,comm_fft,requests(idat))
2216      call timab(544,2,tsec)
2217    end if
2218  end do ! idat
2219 
2220  do idat=1,ndat
2221    if (nproc_fft>1) call xmpi_wait(requests(idat),ierr)
2222 
2223    ! transform along z axis
2224    ! input: G1,G2,R3,(Gp2)
2225    lot=ncache/(4*n3)
2226    do j2=1,md2proc
2227      if (me_fft*md2proc+j2 <= m2oeff) then
2228        do i1=1,m1o,lot
2229          ma=i1
2230          mb=min(i1+(lot-1),m1o)
2231          n1dfft=mb-ma+1
2232 
2233          ! input:  G1,G2,R3,(Gp2)
2234          ! output: G1,R3,G2,(Gp2)
2235          call unscramble(i1,j2,lot,n1dfft,md1,n3,md2proc,nnd3,zmpi2(:,:,:,:,idat),zw(1,1,1))
2236 
2237          inzee=1
2238          do i=1,ic3
2239            call fftstp(lot,n1dfft,n3,lot,n3,zw(1,1,inzee),zw(1,1,3-inzee), &
2240 &           ftrig3,after3(i),now3(i),before3(i),-1)
2241            inzee=3-inzee
2242          end do
2243 
2244          call unfill_cent(md1,md3,lot,n1dfft,max3o,m3o,n3,zw(1,1,inzee),zf(1,i1,1,j2,idat))
2245          ! output: G1,G3,G2,(Gp2)
2246        end do
2247      end if
2248    end do
2249 
2250    ! Complete missing values with complex conjugate
2251    ! Inverse of ix is located at nx+2-ix , except for ix=1, for which it is 1.
2252    if (icplexwf==1) then
2253      do i3=1,m3o
2254        i3inv=m3o+2-i3
2255        if (i3==1) i3inv=1
2256        if (m2oeff>1)then
2257          do i2=2,m2oeff
2258            i2inv=m2o+2-i2
2259            zf(1,1,i3inv,i2inv,idat)= zf(1,1,i3,i2,idat)
2260            zf(2,1,i3inv,i2inv,idat)=-zf(2,1,i3,i2,idat)
2261            do i1=2,m1o
2262              i1inv=m1o+2-i1
2263              zf(1,i1inv,i3inv,i2inv,idat)= zf(1,i1,i3,i2,idat)
2264              zf(2,i1inv,i3inv,i2inv,idat)=-zf(2,i1,i3,i2,idat)
2265            end do
2266          end do
2267        end if
2268      end do
2269    end if
2270 
2271  end do ! idat
2272 
2273  ABI_DEALLOCATE(btrig1)
2274  ABI_DEALLOCATE(ftrig1)
2275  ABI_DEALLOCATE(after1)
2276  ABI_DEALLOCATE(now1)
2277  ABI_DEALLOCATE(before1)
2278  ABI_DEALLOCATE(btrig2)
2279  ABI_DEALLOCATE(ftrig2)
2280  ABI_DEALLOCATE(after2)
2281  ABI_DEALLOCATE(now2)
2282  ABI_DEALLOCATE(before2)
2283  ABI_DEALLOCATE(btrig3)
2284  ABI_DEALLOCATE(ftrig3)
2285  ABI_DEALLOCATE(after3)
2286  ABI_DEALLOCATE(now3)
2287  ABI_DEALLOCATE(before3)
2288 
2289  ABI_DEALLOCATE(zmpi2)
2290  ABI_DEALLOCATE(zw)
2291  ABI_DEALLOCATE(zt)
2292  if (nproc_fft > 1)  then
2293    ABI_DEALLOCATE(zmpi1)
2294  end if
2295 
2296 end subroutine sg2002_applypot_many

m_sg2002/sg2002_back [ Functions ]

[ Top ] [ m_sg2002 ] [ Functions ]

NAME

  sg2002_back

FUNCTION

   CALCULATES THE DISCRETE FOURIER TRANSFORM  in parallel using MPI/OpenMP

   ZR(I1,I2,I3)= \sum_(j1,j2,j3) EXP(isign*i*2*pi*(j1*i1/n1+j2*i2/n2+j3*i3/n3)) ZF(j1,j3,j2)

 Adopt standard convention that isign=1 for backward transform

 INPUTS:
    cplex=1 for real --> complex, 2 for complex --> complex
    ZF: input array in G-space (note the switch of i2 and i3)

         real(F(i1,i3,i2,idat))=ZF(1,i1,i3,i2,idat)
         imag(F(i1,i3,i2,idat))=ZF(2,i1,i3,i2,idat)

         i1=1,n1 , i2=1,n2 , i3=1,n3 , idat=1,ndat
 OUTPUTS:
    ZR: output array in R space.

         ZR(1,i1,i2,i3,idat)=real(R(i1,i2,i3,idat))
         ZR(2,i1,i2,i3,idat)=imag(R(i1,i2,i3,idat))

         i1=1,n1 , i2=1,n2 , i3=1,n3 , idat=1,ndat

    nproc_fft: number of processors used as returned by MPI_COMM_SIZE
    me_fft: [0:nproc_fft-1] number of processor as returned by MPI_COMM_RANK
    n1,n2,n3: logical dimension of the transform. As transform lengths
              most products of the prime factors 2,3,5 are allowed.
              The detailed table with allowed transform lengths can
              be found in subroutine CTRIG
    nd1,nd2,nd3: Dimension of ZF and ZR
    nd2proc=((nd2-1)/nproc_fft)+1 maximal number of 2nd dim slices
    nd3proc=((nd3-1)/nproc_fft)+1 maximal number of 3rd dim slices

 NOTES:
   The maximum number of processors that can reasonably be used is max(n2,n3)
   It is very important to find the optimal
   value of NCACHE. NCACHE determines the size of the work array ZW, that
   has to fit into cache. It has therefore to be chosen to equal roughly
    half the size of the physical cache in units of real*8 numbers.
   The optimal value of ncache can easily be determined by numerical
   experimentation. A too large value of ncache leads to a dramatic
   and sudden decrease of performance, a too small value to a to a
   slow and less dramatic decrease of performance. If NCACHE is set
   to a value so small, that not even a single one dimensional transform
   can be done in the workarray zw, the program stops with an error message.

PARENTS

      ccfft,fourdp,m_sg2002

CHILDREN

SOURCE

126 subroutine sg2002_back(cplex,ndat,n1,n2,n3,nd1,nd2,nd3,nd1eff,nd2proc,nd3proc,option,zf,zr,comm_fft)
127 
128 
129 !This section has been created automatically by the script Abilint (TD).
130 !Do not modify the following lines by hand.
131 #undef ABI_FUNC
132 #define ABI_FUNC 'sg2002_back'
133 !End of the abilint section
134 
135  implicit none
136 
137 !Arguments ------------------------------------
138 ! real space input
139  integer,intent(in) :: cplex,ndat,n1,n2,n3,nd1,nd2,nd3,nd1eff,nd2proc,nd3proc,option,comm_fft
140  real(dp),intent(in) :: zf(2,nd1,nd3,nd2proc,ndat)
141  real(dp),intent(out) :: zr(2,nd1eff,nd2,nd3proc,ndat)
142 
143 !Local variables-------------------------------
144 !scalars
145  integer :: i,j,i1,ic1,ic2,ic3,idat,ierr,includelast,inzee,j2,j2st,j3,jeff,jp2st,lot,lzt
146  integer :: ma,mb,n1dfft,n1eff,n2eff,n1zt,ncache,nnd3,nproc_fft,me_fft
147  character(len=500) :: msg
148 !arrays
149  real(dp), allocatable :: zt(:,:,:)  ! work arrays for transpositions
150  real(dp), allocatable :: zmpi1(:,:,:,:),zmpi2(:,:,:,:) ! work arrays for MPI
151  real(dp), allocatable :: zw(:,:,:) ! cache work array
152  real(dp) :: tsec(2)
153 ! FFT work arrays
154  real(dp), allocatable, dimension(:,:) :: trig1,trig2,trig3
155  integer, allocatable, dimension(:) :: after1,now1,before1,after2,now2,before2,after3,now3,before3
156 
157 ! *************************************************************************
158 
159  nproc_fft = xmpi_comm_size(comm_fft); me_fft = xmpi_comm_rank(comm_fft)
160 
161  ! find cache size that gives optimal performance on machine
162  ncache=4*max(n1,n2,n3,1024)
163 
164  if (ncache/(4*max(n1,n2,n3))<1) then
165    write(msg,'(5a)') &
166 &    'ncache has to be enlarged to be able to hold at',ch10, &
167 &    'least one 1-d FFT of each size even though this will',ch10,&
168 &    'reduce the performance for shorter transform lengths'
169    MSG_ERROR(msg)
170  end if
171 
172 ! check input
173  if (nd1<n1 .or. nd2<n2 .or. nd3<n3) then
174    MSG_ERROR("nd1<n1 .or. nd2<n2 .or. nd3<n3")
175  end if
176 
177  ! Effective n1 and n2 (complex-to-complex or real-to-complex)
178  n1eff=n1; n2eff=n2; n1zt=n1
179  if (cplex==1) then
180    n1eff=(n1+1)/2 ; n2eff=n2/2+1 ; n1zt=2*(n1/2+1)
181  end if
182 
183  lzt=n2eff
184  if (mod(n2eff,2) == 0) lzt=lzt+1
185  if (mod(n2eff,4) == 0) lzt=lzt+1
186 
187 ! maximal number of big box 3rd dim slices for all procs
188  nnd3=nd3proc*nproc_fft
189 
190  ABI_ALLOCATE(trig1,(2,n1))
191  ABI_ALLOCATE(after1,(mdata))
192  ABI_ALLOCATE(now1,(mdata))
193  ABI_ALLOCATE(before1,(mdata))
194  ABI_ALLOCATE(trig2,(2,n2))
195  ABI_ALLOCATE(after2,(mdata))
196  ABI_ALLOCATE(now2,(mdata))
197  ABI_ALLOCATE(before2,(mdata))
198  ABI_ALLOCATE(trig3,(2,n3))
199  ABI_ALLOCATE(after3,(mdata))
200  ABI_ALLOCATE(now3,(mdata))
201  ABI_ALLOCATE(before3,(mdata))
202  ABI_ALLOCATE(zw,(2,ncache/4,2))
203  ABI_ALLOCATE(zt,(2,lzt,n1zt))
204  ABI_ALLOCATE(zmpi2,(2,n1,nd2proc,nnd3))
205  if (nproc_fft>1)  then
206    ABI_ALLOCATE(zmpi1,(2,n1,nd2proc,nnd3))
207  end if
208 
209  call ctrig(n3,trig3,after3,before3,now3,1,ic3)
210  call ctrig(n1,trig1,after1,before1,now1,1,ic1)
211  call ctrig(n2,trig2,after2,before2,now2,1,ic2)
212 
213 !DEBUG
214 ! write(std_out,'(a,3i4)' )'sg2002_back,zf n1,n2,n3',n1,n2,n3
215 ! write(std_out,'(a,3i4)' )'nd1,nd2,nd3proc',nd1,nd2,nd3proc
216 ! write(std_out,'(a,3i4)' )'m1,m2,m3',m1,m2,m3
217 ! write(std_out,'(a,3i4)' )'max1,max2,max3',max1,max2,max3
218 ! write(std_out,'(a,3i4)' )'md1,md2proc,md3',md1,md2proc,md3
219 ! write(std_out,'(a,3i4)' )'n1eff,m2eff,m1zt',n1eff,m2eff,m1zt
220 !ENDDEBUG
221 
222  do idat=1,ndat
223    ! transform along z axis
224    ! input: I1,I3,J2,(Jp2)
225    lot=ncache/(4*n3)
226 
227    do j2=1,nd2proc
228      if (me_fft*nd2proc+j2 <= n2eff) then
229 
230        do i1=1,n1,lot
231          ma=i1
232          mb=min(i1+(lot-1),n1)
233          n1dfft=mb-ma+1
234 
235          ! input: G1,G3,G2,(Gp2)
236          call fill(nd1,nd3,lot,n1dfft,n3,zf(1,i1,1,j2,idat),zw(1,1,1))
237 
238          inzee=1
239          do i=1,ic3
240            call fftstp(lot,n1dfft,n3,lot,n3,zw(1,1,inzee),zw(1,1,3-inzee), &
241 &                      trig3,after3(i),now3(i),before3(i),1)
242            inzee=3-inzee
243          end do
244 
245          ! input:  G1,R3,G2,(Gp2)
246          ! output: G1,G2,R3,(Gp2)
247          call scramble(i1,j2,lot,n1dfft,n1,n3,nd2proc,nd3,zw(1,1,inzee),zmpi2)
248        end do
249      end if
250    end do
251 
252    ! Interprocessor data transposition
253    ! input:  G1,G2,R3,Rp3,(Gp2)
254    ! output: G1,G2,G3,Gp2,(Rp3)
255    if (nproc_fft>1) then
256      call timab(543,1,tsec)
257      call xmpi_alltoall(zmpi2,2*n1*nd2proc*nd3proc, &
258 &                       zmpi1,2*n1*nd2proc*nd3proc,comm_fft,ierr)
259      call timab(543,2,tsec)
260    end if
261 
262    do j3=1,nd3proc
263      if (me_fft*nd3proc+j3 <= n3) then
264        Jp2st=1
265        J2st=1
266 
267        ! transform along x axis
268        lot=ncache/(4*n1)
269 
270        do j=1,n2eff,lot
271          ma=j
272          mb=min(j+(lot-1),n2eff)
273          n1dfft=mb-ma+1
274 
275          ! input:  G1,G2,R3,Gp2,(Rp3)
276          ! output: G2,G1,R3,Jp2,(Rp3)
277          if (nproc_fft == 1) then
278            call mpiswitch(j3,n1dfft,Jp2st,J2st,lot,n1,nd2proc,nd3proc,nproc_fft,option,zmpi2,zw(1,1,1))
279          else
280            call mpiswitch(j3,n1dfft,Jp2st,J2st,lot,n1,nd2proc,nd3proc,nproc_fft,option,zmpi1,zw(1,1,1))
281          end if
282 
283          ! input:  G2,G1,R3,(Rp3)
284          ! output: G2,R1,R3,(Rp3)
285          inzee=1
286          do i=1,ic1-1
287            call fftstp(lot,n1dfft,n1,lot,n1,zw(1,1,inzee),zw(1,1,3-inzee), &
288 &                       trig1,after1(i),now1(i),before1(i),1)
289            inzee=3-inzee
290          end do
291 
292          i=ic1
293          call fftstp(lot,n1dfft,n1,lzt,n1zt,zw(1,1,inzee),zt(1,j,1), &
294 &                    trig1,after1(i),now1(i),before1(i),1)
295        end do
296 
297        ! transform along y axis
298        lot=ncache/(4*n2)
299 
300        do j=1,n1eff,lot
301          ma=j
302          mb=min(j+(lot-1),n1eff)
303          n1dfft=mb-ma+1
304          includelast=1
305 
306          if (cplex==1) then
307           jeff=2*j-1
308           includelast=1
309           if (mb==n1eff .and. n1eff*2/=n1) includelast=0
310          end if
311 
312          ! input:  G2,R1,R3,(Rp3)
313          ! output: R1,G2,R3,(Rp3)
314          if (cplex==2) then
315            call switch(n1dfft,n2,lot,n1,lzt,zt(1,1,j),zw(1,1,1))
316          else
317            call switchreal(includelast,n1dfft,n2,n2eff,lot,n1zt,lzt,zt(1,1,jeff),zw(1,1,1))
318          end if
319 
320          inzee=1
321          do i=1,ic2-1
322            call fftstp(lot,n1dfft,n2,lot,n2,zw(1,1,inzee),zw(1,1,3-inzee), &
323 &                       trig2,after2(i),now2(i),before2(i),1)
324            inzee=3-inzee
325          end do
326 
327          i=ic2
328          call fftstp(lot,n1dfft,n2,nd1eff,nd2,zw(1,1,inzee),zr(1,j,1,j3,idat), &
329 &                    trig2,after2(i),now2(i),before2(i),1)
330 
331        end do
332        ! output: R1,R2,R3,(Rp3)
333 
334      end if
335    end do
336  end do ! idat
337 
338  ABI_DEALLOCATE(trig1)
339  ABI_DEALLOCATE(after1)
340  ABI_DEALLOCATE(now1)
341  ABI_DEALLOCATE(before1)
342  ABI_DEALLOCATE(trig2)
343  ABI_DEALLOCATE(after2)
344  ABI_DEALLOCATE(now2)
345  ABI_DEALLOCATE(before2)
346  ABI_DEALLOCATE(trig3)
347  ABI_DEALLOCATE(after3)
348  ABI_DEALLOCATE(now3)
349  ABI_DEALLOCATE(before3)
350  ABI_DEALLOCATE(zmpi2)
351  ABI_DEALLOCATE(zw)
352  ABI_DEALLOCATE(zt)
353  if (nproc_fft>1)  then
354    ABI_DEALLOCATE(zmpi1)
355  end if
356 
357 end subroutine sg2002_back

m_sg2002/sg2002_forw [ Functions ]

[ Top ] [ m_sg2002 ] [ Functions ]

NAME

  sg2002_forw

FUNCTION

   Adopt standard convention that isign=-1 for forward transform
   CALCULATES THE DISCRETE FOURIERTRANSFORM ZF(I1,I3,I2)=
   S_(j1,j2,j3) EXP(isign*i*2*pi*(j1*i1/n1+j2*i2/n2+j3*i3/n3)) ZR(j1,j2,j3)
   in parallel using MPI/OpenMP and BLAS library calls.

INPUTS

    ZR: input array
         ZR(1,i1,i2,i3,idat)=real(R(i1,i2,i3,idat))
         ZR(2,i1,i2,i3,idat)=imag(R(i1,i2,i3,idat))
         i1=1,n1 , i2=1,n2 , i3=1,n3 , idat=1,ndat
 OUTPUTS
    ZF: output array (note the switch of i2 and i3)
         real(F(i1,i3,i2,idat))=ZF(1,i1,i3,i2,idat)
         imag(F(i1,i3,i2,idat))=ZF(2,i1,i3,i2,idat)
         i1=1,n1 , i2=1,n2 , i3=1,n3 , idat=1,ndat
    nproc_fft: number of processors used as returned by MPI_COMM_SIZE
    me_fft: [0:nproc_fft-1] number of processor as returned by MPI_COMM_RANK
     n1,n2,n3: logical dimension of the transform. As transform lengths
               most products of the prime factors 2,3,5 are allowed.
              The detailed table with allowed transform lengths can
              be found in subroutine CTRIG
     nd1,nd2,nd3: Dimension of ZR and ZF
    nd2proc=((nd2-1)/nproc_fft)+1 maximal number of 2nd dim slices
    nd3proc=((nd3-1)/nproc_fft)+1 maximal number of 3rd dim slices

NOTES

  SHOULD describe nd1eff
  SHOULD put cplex and nd1eff in OMP declarations
  SHOULD describe the change of value of nd2prod

  The maximum number of processors that can reasonably be used is max(n2,n3)

  It is very important to find the optimal
  value of NCACHE. NCACHE determines the size of the work array ZW, that
  has to fit into cache. It has therefore to be chosen to equal roughly
   half the size of the physical cache in units of real*8 numbers.
  The optimal value of ncache can easily be determined by numerical
  experimentation. A too large value of ncache leads to a dramatic
  and sudden decrease of performance, a too small value to a to a
  slow and less dramatic decrease of performance. If NCACHE is set
  to a value so small, that not even a single one dimensional transform
  can be done in the workarray zw, the program stops with an error message.

PARENTS

      ccfft,fourdp,m_sg2002

CHILDREN

SOURCE

417 subroutine sg2002_forw(cplex,ndat,n1,n2,n3,nd1,nd2,nd3,nd1eff,nd2proc,nd3proc,option,zr,zf,comm_fft)
418 
419 
420 !This section has been created automatically by the script Abilint (TD).
421 !Do not modify the following lines by hand.
422 #undef ABI_FUNC
423 #define ABI_FUNC 'sg2002_forw'
424 !End of the abilint section
425 
426  implicit none
427 
428 !Arguments ------------------------------------
429 !scalars
430  integer,intent(in) :: cplex,comm_fft
431  integer,intent(in) :: ndat,n1,n2,n3,nd1,nd2,nd3,nd1eff,nd2proc,nd3proc,option
432 !arrays
433  real(dp),intent(in) :: zr(2,nd1eff,nd2,nd3proc,ndat)
434  real(dp),intent(out) :: zf(2,nd1,nd3,nd2proc,ndat)
435 
436 !Local variables-------------------------------
437 !scalars
438  integer :: i,j,i1,ic1,ic2,ic3,idat,ierr,inzee,j2,j2st,j3,jp2st,lot,lzt
439  integer :: ma,mb,n1dfft,n1eff,n2eff,n1zt,ncache,nnd3,nproc_fft,me_fft
440  character(len=500) :: msg
441 !arrays
442  real(dp), allocatable :: zt(:,:,:) ! work arrays for transpositions
443  real(dp), allocatable :: zmpi1(:,:,:,:),zmpi2(:,:,:,:) ! work arrays for MPI
444  real(dp), allocatable :: zw(:,:,:) ! cache work array
445  real(dp) :: tsec(2)
446 ! FFT work arrays
447  real(dp), allocatable, dimension(:,:) :: trig1,trig2,trig3
448  integer, allocatable, dimension(:) :: after1,now1,before1,after2,now2,before2,after3,now3,before3
449 
450 ! *************************************************************************
451 
452  nproc_fft = xmpi_comm_size(comm_fft); me_fft = xmpi_comm_rank(comm_fft)
453 
454  ! find cache size that gives optimal performance on machine
455  ncache=4*max(n1,n2,n3,1024)
456  if (ncache/(4*max(n1,n2,n3))<1) then
457    write(msg,'(5a)')&
458 &     'ncache has to be enlarged to be able to hold at',ch10, &
459 &     'least one 1-d FFT of each size even though this will',ch10,&
460 &     'reduce the performance for shorter transform lengths'
461    MSG_ERROR(msg)
462  end if
463 
464  ! check input
465  if (nd1<n1 .or. nd2<n2 .or. nd3<n3) then
466    MSG_ERROR("nd1<n1 .or. nd2<n2 .or. nd3<n3")
467  end if
468 
469 !Effective n1 and n2 (complex-to-complex or real-to-complex)
470  n1eff=n1; n2eff=n2; n1zt=n1
471  if (cplex==1) then
472    n1eff=(n1+1)/2; n2eff=n2/2+1; n1zt=2*(n1/2+1)
473  end if
474 
475  lzt=n2eff
476  if (mod(n2eff,2) == 0) lzt=lzt+1
477  if (mod(n2eff,4) == 0) lzt=lzt+1
478 
479  ! maximal number of big box 3rd dim slices for all procs
480  nnd3=nd3proc*nproc_fft
481 
482  ABI_ALLOCATE(trig1,(2,n1))
483  ABI_ALLOCATE(after1,(mdata))
484  ABI_ALLOCATE(now1,(mdata))
485  ABI_ALLOCATE(before1,(mdata))
486  ABI_ALLOCATE(trig2,(2,n2))
487  ABI_ALLOCATE(after2,(mdata))
488  ABI_ALLOCATE(now2,(mdata))
489  ABI_ALLOCATE(before2,(mdata))
490  ABI_ALLOCATE(trig3,(2,n3))
491  ABI_ALLOCATE(after3,(mdata))
492  ABI_ALLOCATE(now3,(mdata))
493  ABI_ALLOCATE(before3,(mdata))
494  ABI_ALLOCATE(zw,(2,ncache/4,2))
495  ABI_ALLOCATE(zt,(2,lzt,n1zt))
496  ABI_ALLOCATE(zmpi2,(2,n1,nd2proc,nnd3))
497  if (nproc_fft>1)  then
498    ABI_ALLOCATE(zmpi1,(2,n1,nd2proc,nnd3))
499  end if
500 
501  call ctrig(n2,trig2,after2,before2,now2,-1,ic2)
502  call ctrig(n1,trig1,after1,before1,now1,-1,ic1)
503  call ctrig(n3,trig3,after3,before3,now3,-1,ic3)
504 
505  do idat=1,ndat
506    do j3=1,nd3proc
507      if (me_fft*(nd3proc)+j3 <= n3) then
508        Jp2st=1; J2st=1
509 
510        ! transform along y axis
511        ! input: R1,R2,R3,(Rp3)
512        lot=ncache/(4*n2)
513 
514        do j=1,n1eff,lot
515          ma=j
516          mb=min(j+(lot-1),n1eff)
517          n1dfft=mb-ma+1
518          i=1
519          call fftstp(nd1eff,n1dfft,nd2,lot,n2,zr(1,j,1,j3,idat),zw(1,1,1), &
520 &                    trig2,after2(i),now2(i),before2(i),-1)
521 
522          inzee=1
523          do i=2,ic2
524            call fftstp(lot,n1dfft,n2,lot,n2,zw(1,1,inzee),zw(1,1,3-inzee), &
525 &                       trig2,after2(i),now2(i),before2(i),-1)
526             inzee=3-inzee
527          end do
528 
529          !  input: R1,G2,R3,(Rp3)
530          ! output: G2,R1,R3,(Rp3)
531          if(cplex==2)then
532            call unswitch(n1dfft,n2,lot,n1zt,lzt,zw(1,1,inzee),zt(1,1,j))
533          else
534            call unswitchreal(n1dfft,n2,n2eff,lot,n1zt,lzt,zw(1,1,inzee),zt(1,1,2*j-1))
535          end if
536        end do
537 
538        ! transform along x axis
539        ! input: G2,R1,R3,(Rp3)
540        lot=ncache/(4*n1)
541 
542        do j=1,n2eff,lot
543          ma=j
544          mb=min(j+(lot-1),n2eff)
545          n1dfft=mb-ma+1
546 
547          i=1
548          call fftstp(lzt,n1dfft,n1zt,lot,n1,zt(1,j,1),zw(1,1,1), &
549 &                    trig1,after1(i),now1(i),before1(i),-1)
550 
551          inzee=1
552          do i=2,ic1
553            call fftstp(lot,n1dfft,n1,lot,n1,zw(1,1,inzee),zw(1,1,3-inzee), &
554 &                     trig1,after1(i),now1(i),before1(i),-1)
555            inzee=3-inzee
556          end do
557          ! output: G2,G1,R3,(Rp3)
558 
559          ! input:  G2,G1,R3,Gp2,(Rp3)
560          ! output: G1,G2,R3,Gp2,(Rp3)
561          ! write(std_out,*) 'J2st,Jp2st',J2st,Jp2st
562          if (nproc_fft == 1) then
563            call unmpiswitch(j3,n1dfft,Jp2st,J2st,lot,n1,nd2proc,nd3proc,nproc_fft,option,zw(1,1,inzee),zmpi2)
564          else
565            call unmpiswitch(j3,n1dfft,Jp2st,J2st,lot,n1,nd2proc,nd3proc,nproc_fft,option,zw(1,1,inzee),zmpi1)
566          end if
567        end do
568 
569      end if
570    end do ! j3
571 
572    ! Interprocessor data transposition
573    ! input:  G1,G2,R3,Gp2,(Rp3)
574    ! output: G1,G2,R3,Rp3,(Gp2)
575    if (nproc_fft>1) then
576      call timab(544,1,tsec)
577      call xmpi_alltoall(zmpi1,2*n1*nd2proc*nd3proc, &
578 &                       zmpi2,2*n1*nd2proc*nd3proc,comm_fft,ierr)
579      call timab(544,2,tsec)
580    end if
581 
582    ! transform along z axis
583    ! input: G1,G2,R3,(Gp2)
584    lot=ncache/(4*n3)
585 
586    do j2=1,nd2proc
587      if (me_fft*(nd2proc)+j2 <= n2eff) then
588        do i1=1,n1,lot
589          ma=i1
590          mb=min(i1+(lot-1),n1)
591          n1dfft=mb-ma+1
592 
593          ! input:  G1,G2,R3,(Gp2)
594          ! output: G1,R3,G2,(Gp2)
595          call unscramble(i1,j2,lot,n1dfft,n1,n3,nd2proc,nd3,zmpi2,zw(1,1,1))
596 
597          inzee=1
598          do i=1,ic3
599            call fftstp(lot,n1dfft,n3,lot,n3,zw(1,1,inzee),zw(1,1,3-inzee), &
600 &            trig3,after3(i),now3(i),before3(i),-1)
601            inzee=3-inzee
602          end do
603 
604          call unfill(nd1,nd3,lot,n1dfft,n3,zw(1,1,inzee),zf(1,i1,1,j2,idat))
605          ! output: G1,G3,G2,(Gp2)
606        end do
607      end if
608    end do
609 
610  end do ! idat
611 
612  ABI_DEALLOCATE(trig1)
613  ABI_DEALLOCATE(after1)
614  ABI_DEALLOCATE(now1)
615  ABI_DEALLOCATE(before1)
616  ABI_DEALLOCATE(trig2)
617  ABI_DEALLOCATE(after2)
618  ABI_DEALLOCATE(now2)
619  ABI_DEALLOCATE(before2)
620  ABI_DEALLOCATE(trig3)
621  ABI_DEALLOCATE(after3)
622  ABI_DEALLOCATE(now3)
623  ABI_DEALLOCATE(before3)
624  ABI_DEALLOCATE(zmpi2)
625  ABI_DEALLOCATE(zw)
626  ABI_DEALLOCATE(zt)
627  if (nproc_fft>1)  then
628    ABI_DEALLOCATE(zmpi1)
629  end if
630 
631 end subroutine sg2002_forw

m_sg2002/sg2002_mpiback_wf [ Functions ]

[ Top ] [ m_sg2002 ] [ Functions ]

NAME

  sg2002_mpiback_wf

FUNCTION

   Does multiple 3-dim backward FFTs from Fourier into real space
   Adopt standard convention that isign=1 for backward transform

   CALCULATES THE DISCRETE FOURIER TRANSFORM ZF(I1,I2,I3)=

   S_(j1,j2,j3) EXP(isign*i*2*pi*(j1*i1/n1+j2*i2/n2+j3*i3/n3)) ZF(j1,j3,j2)

   in parallel using MPI/OpenMP.

 INPUTS:
    icplexwf=1 if wavefunction is real, 2 if complex
    ndat=Number of wavefunctions to transform.
    n1,n2,n3: logical dimension of the transform. As transform lengths
              most products of the prime factors 2,3,5 are allowed.
              The detailed table with allowed transform lengths can be found in subroutine CTRIG
    nd1,nd2,nd3: Leading Dimension of ZR
    nd3proc=((nd3-1)/nproc_fft)+1 maximal number of big box 3rd dim slices for one proc
    max1 is positive or zero; m1 >=max1+1
      i1= 1... max1+1 corresponds to positive and zero wavevectors 0 ... max1
      then, if m1 > max1+1, one has min1=max1-m1+1 and
      i1= max1+2 ... m1 corresponds to negative wavevectors min1 ... -1
    max2 and max3 have a similar definition of range
    m1,m2,m3=Size of the box enclosing the G-sphere.
    md1,md2,md3: Dimension of ZF given on the **small** FFT box.
    md2proc=((md2-1)/nproc_fft)+1 maximal number of small box 2nd dim slices for one proc
    nproc_fft: number of processors used as returned by MPI_COMM_SIZE
    comm_fft=MPI communicator for the FFT.
    ZF: input array (note the switch of i2 and i3)
          real(F(i1,i3,i2,idat))=ZF(1,i1,i3,i2,idat)
          imag(F(i1,i3,i2,idat))=ZF(2,i1,i3,i2,idat)

 OUTPUTS
    ZR: output array
          ZR(1,i1,i2,i3,idat)=real(R(i1,i2,i3,idat))
          ZR(2,i1,i2,i3,idat)=imag(R(i1,i2,i3,idat))
        i1=1,n1 , i2=1,n2 , i3=1,n3 , idat=1,ndat

NOTES

   The maximum number of processors that can reasonably be used is max(n2/2,n3/2)

   It is very important to find the optimal
   value of NCACHE. NCACHE determines the size of the work array ZW, that
   has to fit into cache. It has therefore to be chosen to equal roughly
   half the size of the physical cache in units of real*8 numbers.
   The optimal value of ncache can easily be determined by numerical
   experimentation. A too large value of ncache leads to a dramatic
   and sudden decrease of performance, a too small value to a to a
   slow and less dramatic decrease of performance. If NCACHE is set
   to a value so small, that not even a single one dimensional transform
   can be done in the workarray zw, the program stops with an error message.

PARENTS

      m_fft

CHILDREN

SOURCE

699 subroutine sg2002_mpiback_wf(icplexwf,ndat,n1,n2,n3,nd1,nd2,nd3proc,&
700 &  max1,max2,max3,m1,m2,m3,md1,md2proc,md3,zf,zr,comm_fft)
701 
702 
703 !This section has been created automatically by the script Abilint (TD).
704 !Do not modify the following lines by hand.
705 #undef ABI_FUNC
706 #define ABI_FUNC 'sg2002_mpiback_wf'
707 !End of the abilint section
708 
709  implicit none
710 
711 !Arguments ------------------------------------
712  integer,intent(in) :: icplexwf,ndat,n1,n2,n3,nd1,nd2,nd3proc
713  integer,intent(in) :: max1,max2,max3,m1,m2,m3,md1,md2proc,md3,comm_fft
714  real(dp),intent(in) :: zf(2,md1,md3,md2proc,ndat)
715  real(dp),intent(out) :: zr(2,nd1,nd2,nd3proc,ndat)
716 
717 !Local variables-------------------------------
718  integer :: i,j,i1,i2,ic1,ic2,ic3,idat,ierr,inzee,includelast
719  integer :: ioption,j2,j3,j2st,jp2st,jeff,lot,lzt,m1zt,ma,mb,n1dfft,nnd3
720  integer :: m2eff,ncache,n1eff,n1half,nproc_fft,me_fft
721  character(len=500) :: msg
722 !arrays
723  real(dp),allocatable :: zt(:,:,:) ! work arrays for transpositions
724  real(dp),allocatable :: zmpi1(:,:,:,:,:),zmpi2(:,:,:,:,:)  ! work arrays for MPI
725  real(dp),allocatable :: zw(:,:,:) ! cache work array
726 ! FFT work arrays
727  real(dp),allocatable :: trig1(:,:),trig2(:,:),trig3(:,:)
728  integer,allocatable :: after1(:),now1(:),before1(:),after2(:)
729  integer,allocatable :: now2(:),before2(:),after3(:),now3(:),before3(:)
730  real(dp) :: tsec(2)
731 
732 ! *************************************************************************
733 
734  ! call timab(541,1,tsec)
735  ! FIXME must provide a default value but which one?
736  ! ioption = 0
737  ioption = 1
738  !if (paral_kgb==1) ioption=1
739 
740  nproc_fft = xmpi_comm_size(comm_fft); me_fft = xmpi_comm_rank(comm_fft)
741 
742  ! Find cache size that gives optimal performance on machine
743  ncache=4*max(n1,n2,n3,1024)
744  if (ncache/(4*max(n1,n2,n3))<1) then
745    write(msg,"(5a)") &
746 &    'ncache has to be enlarged to be able to hold at',ch10, &
747 &    'least one 1-d FFT of each size even though this will',ch10,&
748 &    'reduce the performance for shorter transform lengths'
749     MSG_ERROR(msg)
750  end if
751 
752  ! Effective m1 and m2 (complex-to-complex or real-to-complex)
753  n1eff=n1; m2eff=m2; m1zt=n1
754  if (icplexwf==1) then
755    n1eff=(n1+1)/2; m2eff=m2/2+1; m1zt=2*(n1/2+1)
756  end if
757 
758  lzt=m2eff
759  if (mod(m2eff,2)==0) lzt=lzt+1
760  if (mod(m2eff,4)==0) lzt=lzt+1
761 
762  ! maximal number of big box 3rd dim slices for all procs
763  nnd3=nd3proc*nproc_fft
764 
765  ABI_ALLOCATE(trig1,(2,n1))
766  ABI_ALLOCATE(after1,(mdata))
767  ABI_ALLOCATE(now1,(mdata))
768  ABI_ALLOCATE(before1,(mdata))
769  ABI_ALLOCATE(trig2,(2,n2))
770  ABI_ALLOCATE(after2,(mdata))
771  ABI_ALLOCATE(now2,(mdata))
772  ABI_ALLOCATE(before2,(mdata))
773  ABI_ALLOCATE(trig3,(2,n3))
774  ABI_ALLOCATE(after3,(mdata))
775  ABI_ALLOCATE(now3,(mdata))
776  ABI_ALLOCATE(before3,(mdata))
777 
778  ! Allocate cache work array and work arrays for MPI transpositions.
779  ABI_ALLOCATE(zw,(2,ncache/4,2))
780  ABI_ALLOCATE(zt,(2,lzt,m1zt))
781  ABI_ALLOCATE(zmpi2,(2,md1,md2proc,nnd3,ndat))
782  if (nproc_fft>1)  then
783    ABI_ALLOCATE(zmpi1,(2,md1,md2proc,nnd3,ndat))
784  end if
785 
786  ! Compute twiddle coefficients.
787  call ctrig(n3,trig3,after3,before3,now3,1,ic3)
788  call ctrig(n1,trig1,after1,before1,now1,1,ic1)
789  call ctrig(n2,trig2,after2,before2,now2,1,ic2)
790 
791 !DEBUG
792 ! write(std_out,'(2a,3i4)' )itoa(me_fft),': sg2002_mpiback_wf,zf n1,n2,n3',n1,n2,n3
793 ! write(std_out,'(2a,3i4)' )itoa(me_fft),': nd1,nd2,nd3proc',nd1,nd2,nd3proc
794 ! write(std_out,'(2a,3i4)' )itoa(me_fft),': m1,m2,m3',m1,m2,m3
795 ! write(std_out,'(2a,3i4)' )itoa(me_fft),': max1,max2,max3',max1,max2,max3
796 ! write(std_out,'(2a,3i4)' )itoa(me_fft),': md1,md2proc,md3',md1,md2proc,md3
797 ! write(std_out,'(2a,3i4)' )itoa(me_fft),'n1eff,m2eff,m1zt',n1eff,m2eff,m1zt
798 !ENDDEBUG
799 
800  do idat=1,ndat
801 
802     ! transform along z axis
803     ! input: G1,G3,G2,(Gp2)
804     lot=ncache/(4*n3)
805 
806     zw(:,:,:)=zero
807     zt(:,:,:)=zero
808 
809     ! Loop over the y planes treated by this node and trasform n1ddft G_z lines.
810     do j2=1,md2proc
811 
812       ! if (me_fft*md2proc+j2<=m2eff) then !a faire plus tard
813 
814       do i1=1,m1,lot
815         ma=i1
816         mb=min(i1+(lot-1),m1)
817         n1dfft=mb-ma+1
818 
819         ! zero-pad n1dfft G_z lines
820         ! input:  G1,G3,G2,(Gp2)
821         ! output: G1,R3,G2,(Gp2)
822         call fill_cent(md1,md3,lot,n1dfft,max3,m3,n3,zf(1,i1,1,j2,idat),zw(1,1,1))
823 
824         ! Transform along z.
825         inzee=1
826         do i=1,ic3
827           call fftstp(lot,n1dfft,n3,lot,n3,zw(1,1,inzee),zw(1,1,3-inzee), &
828 &                     trig3,after3(i),now3(i),before3(i),1)
829           inzee=3-inzee
830         end do
831 
832         ! Local rotation.
833         ! input:  G1,R3,G2,(Gp2)
834         ! output: G1,G2,R3,(Gp2)
835         call scramble(i1,j2,lot,n1dfft,md1,n3,md2proc,nnd3,zw(1,1,inzee),zmpi2(:,:,:,:,idat))
836       end do
837       !
838     end do ! j2
839 
840     ! Interprocessor data transposition
841     ! input:  G1,G2,R3,Rp3,(Gp2)
842     ! output: G1,G2,R3,Gp2,(Rp3)
843     if (nproc_fft>1) then
844       call timab(543,1,tsec)
845       call xmpi_alltoall(zmpi2(:,:,:,:,idat),2*md1*md2proc*nd3proc, &
846 &                        zmpi1(:,:,:,:,idat),2*md1*md2proc*nd3proc,comm_fft,ierr)
847       call timab(543,2,tsec)
848     end if
849 
850     ! Loop over the z treated by this node.
851     do j3=1,nd3proc
852       !j3glob = j3 + me_fft*nd3proc
853       if (me_fft*nd3proc+j3 <= n3) then
854         Jp2st=1; J2st=1
855 
856         lot=ncache/(4*n1)
857 
858         ! Loop over G_y in the small box.
859         do j=1,m2eff,lot
860           ma=j
861           mb=min(j+(lot-1),m2eff)
862           n1dfft=mb-ma+1
863 
864           ! Zero-pad input.
865           ! input:  G1,G2,R3,JG2,(Rp3)
866           ! output: G2,G1,R3,JG2,(Rp3)
867           if (nproc_fft==1) then
868             call mpiswitch_cent(j3,n1dfft,Jp2st,J2st,lot,max1,md1,m1,n1,&
869 &             md2proc,nd3proc,nproc_fft,ioption,zmpi2(:,:,:,:,idat),zw(1,1,1),max2,m2,n2)
870           else
871             call mpiswitch_cent(j3,n1dfft,Jp2st,J2st,lot,max1,md1,m1,n1,&
872 &             md2proc,nd3proc,nproc_fft,ioption,zmpi1(:,:,:,:,idat),zw(1,1,1),max2,m2,n2)
873           end if
874 
875           ! Transform along x
876           ! input:  G2,G1,R3,(Rp3)
877           ! output: G2,R1,R3,(Rp3)
878           inzee=1
879           do i=1,ic1-1
880             call fftstp(lot,n1dfft,n1,lot,n1,zw(1,1,inzee),zw(1,1,3-inzee), &
881 &                       trig1,after1(i),now1(i),before1(i),1)
882             inzee=3-inzee
883           end do
884 
885           i=ic1
886           call fftstp(lot,n1dfft,n1,lzt,m1zt,zw(1,1,inzee),zt(1,j,1), &
887 &                     trig1,after1(i),now1(i),before1(i),1)
888         end do
889 
890         ! Transform along y axis (take into account c2c or c2r case).
891         ! Must loop over the full box.
892         lot=ncache/(4*n2)
893 
894         do j=1,n1eff,lot
895           ma=j
896           mb=min(j+(lot-1),n1eff)
897           n1dfft=mb-ma+1
898           includelast=1
899           if (icplexwf==1) then
900             jeff=2*j-1
901             if (mb==n1eff .and. n1eff*2/=n1) includelast=0
902           end if
903 
904           ! Zero-pad the input.
905           ! input:  G2,R1,R3,(Rp3)
906           ! output: R1,G2,R3,(Rp3)
907           if (icplexwf==2) then
908             call switch_cent(n1dfft,max2,m2,n2,lot,n1,lzt,zt(1,1,j),zw(1,1,1))
909           else
910             call switchreal_cent(includelast,n1dfft,max2,n2,lot,m1zt,lzt,zt(1,1,jeff),zw(1,1,1))
911           end if
912 
913           ! input:  R1,G2,R3,(Rp3)
914           ! output: R1,R2,R3,(Rp3)
915           inzee=1
916           do i=1,ic2-1
917             call fftstp(lot,n1dfft,n2,lot,n2,zw(1,1,inzee),zw(1,1,3-inzee), &
918 &                        trig2,after2(i),now2(i),before2(i),1)
919             inzee=3-inzee
920           end do
921 
922           i=ic2
923 
924         call fftstp(lot,n1dfft,n2,nd1,nd2,zw(1,1,inzee),zr(1,j,1,j3,idat), &
925 &                     trig2,after2(i),now2(i),before2(i),1)
926 
927 
928         end do
929 
930         ! Treat real wavefunctions.
931         if (icplexwf==1) then
932           n1half=n1/2
933           ! If odd
934           if (n1half*2/=n1) then
935             do i2=1,n2
936               zr(1,n1,i2,j3,idat)=zr(1,n1eff,i2,j3,idat)
937               zr(2,n1,i2,j3,idat)=zero
938             end do
939           end if
940           do i2=1,n2
941             do i1=n1half,1,-1
942               zr(1,2*i1-1,i2,j3,idat)=zr(1,i1,i2,j3,idat)
943               zr(1,2*i1  ,i2,j3,idat)=zr(2,i1,i2,j3,idat)
944               zr(2,2*i1-1,i2,j3,idat)=zero
945               zr(2,2*i1  ,i2,j3,idat)=zero
946             end do
947           end do
948         end if
949 
950       end if
951 
952    end do ! j3
953  end do ! idat
954 
955  ABI_DEALLOCATE(trig1)
956  ABI_DEALLOCATE(after1)
957  ABI_DEALLOCATE(now1)
958  ABI_DEALLOCATE(before1)
959  ABI_DEALLOCATE(trig2)
960  ABI_DEALLOCATE(after2)
961  ABI_DEALLOCATE(now2)
962  ABI_DEALLOCATE(before2)
963  ABI_DEALLOCATE(trig3)
964  ABI_DEALLOCATE(after3)
965  ABI_DEALLOCATE(now3)
966  ABI_DEALLOCATE(before3)
967  ABI_DEALLOCATE(zmpi2)
968  ABI_DEALLOCATE(zw)
969  ABI_DEALLOCATE(zt)
970  if (nproc_fft>1)  then
971    ABI_DEALLOCATE(zmpi1)
972  end if
973 
974  !call timab(541,2,tsec)
975 
976 end subroutine sg2002_mpiback_wf

m_sg2002/sg2002_mpiforw_wf [ Functions ]

[ Top ] [ m_sg2002 ] [ Functions ]

NAME

  sg2002_mpiforw_wf

FUNCTION

   Does multiple 3-dim backward FFTs from real into Fourier space
   Adopt standard convention that isign=-1 for forward transform
   CALCULATES THE DISCRETE FOURIERTRANSFORM

   ZF(I1,I3,I2)=S_(j1,j2,j3) EXP(isign*i*2*pi*(j1*i1/n1+j2*i2/n2+j3*i3/n3)) ZR(j1,j2,j3)

   in parallel using MPI/OpenMP.

 INPUT:
   ZR: input array
        ZR(1,i1,i2,i3,idat)=real(R(i1,i2,i3,idat))
        ZR(2,i1,i2,i3,idat)=imag(R(i1,i2,i3,idat))
        i1=1,n1 , i2=1,n2 , i3=1,n3 , idat=1,ndat
   NOTE that ZR is changed by the routine

   n1,n2,n3: logical dimension of the transform. As transform lengths
             most products of the prime factors 2,3,5 are allowed.
             The detailed table with allowed transform lengths can
             be found in subroutine CTRIG
   nd1,nd2,nd3: Dimension of ZR
   nd3proc=((nd3-1)/nproc_fft)+1  maximal number of big box 3rd dim slices for one proc

 OUTPUT:
   ZF: output array (note the switch of i2 and i3)
        real(F(i1,i3,i2,idat))=ZF(1,i1,i3,i2,idat)
        imag(F(i1,i3,i2,idat))=ZF(2,i1,i3,i2,idat)
   max1 is positive or zero ; m1 >=max1+1
     i1= 1... max1+1 corresponds to positive and zero wavevectors 0 ... max1
     then, if m1 > max1+1, one has min1=max1-m1+1 and
     i1= max1+2 ... m1 corresponds to negative wavevectors min1 ... -1
     i2 and i3 have a similar definition of range
   idat=1,ndat
   md1,md2,md3: Dimension of ZF
   md2proc=((md2-1)/nproc_fft)+1  maximal number of small box 2nd dim slices for one proc
   nproc_fft: number of processors used as returned by MPI_COMM_SIZE
   me_fft: [0:nproc-1] rank of the processor in the FFT communicator.
   comm_fft=MPI communicator for parallel FFT.

NOTES

  The maximum number of processors that can reasonably be used is max(n2/2,n3/2)

  It is very important to find the optimal
  value of NCACHE. NCACHE determines the size of the work array ZW, that
  has to fit into cache. It has therefore to be chosen to equal roughly
   half the size of the physical cache in units of real*8 numbers.
  The optimal value of ncache can easily be determined by numerical
  experimentation. A too large value of ncache leads to a dramatic
  and sudden decrease of performance, a too small value to a to a
  slow and less dramatic decrease of performance. If NCACHE is set
  to a value so small, that not even a single one dimensional transform
  can be done in the workarray zw, the program stops with an error message.

PARENTS

      m_fft

CHILDREN

SOURCE

1045 subroutine sg2002_mpiforw_wf(icplexwf,ndat,n1,n2,n3,nd1,nd2,nd3proc,&
1046 &        max1,max2,max3,m1,m2,m3,md1,md2proc,md3,zr,zf,comm_fft)
1047 
1048 
1049 !This section has been created automatically by the script Abilint (TD).
1050 !Do not modify the following lines by hand.
1051 #undef ABI_FUNC
1052 #define ABI_FUNC 'sg2002_mpiforw_wf'
1053 !End of the abilint section
1054 
1055  implicit none
1056 
1057 !Arguments ------------------------------------
1058 !scalars
1059  integer,intent(in) :: icplexwf,ndat,n1,n2,n3,nd1,nd2,nd3proc
1060  integer,intent(in) :: max1,max2,max3,m1,m2,m3,md1,md2proc,md3,comm_fft
1061 !arrays
1062  real(dp),intent(inout) :: zr(2,nd1,nd2,nd3proc,ndat)
1063  real(dp),intent(out) :: zf(2,md1,md3,md2proc,ndat)
1064 
1065 !Local variables-------------------------------
1066 !scalars
1067  integer :: i,j,i1,i2,i3,ic1,ic2,ic3,idat,ierr,inzee,nproc_fft,me_fft
1068  integer :: ioption,j2,j3,j2st,jp2st,lot,lzt,m1zt,ma,mb,n1dfft,nnd3
1069  integer :: m2eff,ncache,n1eff,n1half,i1inv,i2inv,i3inv
1070  character(len=500) :: msg
1071 !arrays
1072  real(dp), allocatable :: zt(:,:,:) ! work arrays for transpositions
1073  real(dp), allocatable :: zmpi1(:,:,:,:,:),zmpi2(:,:,:,:,:) ! work arrays for MPI
1074  real(dp), allocatable :: zw(:,:,:) ! cache work array
1075 ! FFT work arrays
1076  real(dp), allocatable :: trig1(:,:),trig2(:,:),trig3(:,:)
1077  integer, allocatable :: after1(:),now1(:),before1(:),after2(:),now2(:),before2(:),after3(:),now3(:),before3(:)
1078  real(dp) :: tsec(2)
1079 
1080 ! *************************************************************************
1081 
1082  ! call timab(542,1,tsec)
1083 
1084  ! FIXME must provide a default value but which one?
1085  !ioption = 0
1086  ioption = 1
1087  !if (paral_kgb==1) ioption=1
1088 
1089  nproc_fft = xmpi_comm_size(comm_fft); me_fft = xmpi_comm_rank(comm_fft)
1090 
1091  ! find cache size that gives optimal performance on machine
1092  ncache=4*max(n1,n2,n3,1024)
1093  if (ncache/(4*max(n1,n2,n3))<1) then
1094    write(msg,'(5a)') &
1095 &    'ncache has to be enlarged to be able to hold at',ch10, &
1096 &    'least one 1-d FFT of each size even though this will',ch10,&
1097 &    'reduce the performance for shorter transform lengths'
1098    MSG_ERROR(msg)
1099  end if
1100 
1101  ! Effective m1 and m2 (complex-to-complex or real-to-complex)
1102  n1eff=n1; m2eff=m2; m1zt=n1
1103  if (icplexwf==1) then
1104    n1eff=(n1+1)/2; m2eff=m2/2+1; m1zt=2*(n1/2+1)
1105  end if
1106 
1107  lzt=m2eff
1108  if (mod(m2eff,2)==0) lzt=lzt+1
1109  if (mod(m2eff,4)==0) lzt=lzt+1
1110 
1111  ! maximal number of big box 3rd dim slices for all procs
1112  nnd3=nd3proc*nproc_fft
1113 
1114  ABI_ALLOCATE(trig1,(2,n1))
1115  ABI_ALLOCATE(after1,(mdata))
1116  ABI_ALLOCATE(now1,(mdata))
1117  ABI_ALLOCATE(before1,(mdata))
1118  ABI_ALLOCATE(trig2,(2,n2))
1119  ABI_ALLOCATE(after2,(mdata))
1120  ABI_ALLOCATE(now2,(mdata))
1121  ABI_ALLOCATE(before2,(mdata))
1122  ABI_ALLOCATE(trig3,(2,n3))
1123  ABI_ALLOCATE(after3,(mdata))
1124  ABI_ALLOCATE(now3,(mdata))
1125  ABI_ALLOCATE(before3,(mdata))
1126  ABI_ALLOCATE(zw,(2,ncache/4,2))
1127  ABI_ALLOCATE(zt,(2,lzt,m1zt))
1128  ABI_ALLOCATE(zmpi2,(2,md1,md2proc,nnd3,ndat))
1129  if (nproc_fft>1)  then
1130    ABI_ALLOCATE(zmpi1,(2,md1,md2proc,nnd3,ndat))
1131  end if
1132 
1133  call ctrig(n2,trig2,after2,before2,now2,-1,ic2)
1134  call ctrig(n1,trig1,after1,before1,now1,-1,ic1)
1135  call ctrig(n3,trig3,after3,before3,now3,-1,ic3)
1136 
1137 !DEBUG
1138 ! write(std_out,'(2a,3i4)' )itoa(me_fft),'sg2002_mpiforw_wf, enter', i1,i2,i3,zr,n1,n2,n3',n1,n2,n3
1139 ! write(std_out,'(2a,3i4)' )itoa(me_fft),'nd1,nd2,nd3proc',nd1,nd2,nd3proc
1140 ! write(std_out,'(2a,3i4)' )itoa(me_fft),'m1,m2,m3',m1,m2,m3
1141 ! write(std_out,'(2a,3i4)' )itoa(me_fft),'max1,max2,max3',max1,max2,max3
1142 ! write(std_out,'(2a,3i4)' )itoa(me_fft),'md1,md2proc,md3',md1,md2proc,md3
1143 ! write(std_out,'(2a,3i4)' )itoa(me_fft),'n1eff,m2eff,m1zt',n1eff,m2eff,m1zt
1144 !ENDDEBUG
1145 
1146   do idat=1,ndat
1147     ! Loop over the z-planes treated by this node
1148     do j3=1,nd3proc
1149 
1150        if (me_fft*nd3proc+j3 <= n3) then
1151          Jp2st=1
1152          J2st=1
1153 
1154          ! Treat real wavefunctions.
1155          if (icplexwf==1) then
1156            n1half=n1/2
1157            do i2=1,n2
1158              do i1=1,n1half
1159                zr(1,i1,i2,j3,idat)=zr(1,2*i1-1,i2,j3,idat)
1160                zr(2,i1,i2,j3,idat)=zr(1,2*i1  ,i2,j3,idat)
1161              end do
1162            end do
1163            ! If odd
1164            if(n1half*2/=n1)then
1165              do i2=1,n2
1166                zr(1,n1eff,i2,j3,idat)=zr(1,n1,i2,j3,idat)
1167                zr(2,n1eff,i2,j3,idat)=zero
1168              end do
1169            end if
1170          end if
1171 
1172          ! transform along y axis
1173          ! input: R1,R2,R3,(Rp3)
1174          ! input: R1,G2,R3,(Rp3)
1175          lot=ncache/(4*n2)
1176 
1177          do j=1,n1eff,lot
1178            ma=j
1179            mb=min(j+(lot-1),n1eff)
1180            n1dfft=mb-ma+1
1181            i=1
1182            call fftstp(nd1,n1dfft,nd2,lot,n2,zr(1,j,1,j3,idat),zw(1,1,1), &
1183 &                      trig2,after2(i),now2(i),before2(i),-1)
1184 
1185            inzee=1
1186            do i=2,ic2
1187              call fftstp(lot,n1dfft,n2,lot,n2,zw(1,1,inzee),zw(1,1,3-inzee), &
1188 &                         trig2,after2(i),now2(i),before2(i),-1)
1189               inzee=3-inzee
1190            end do
1191 
1192            ! input:  R1,G2,R3,(Rp3)
1193            ! output: G2,R1,R3,(Rp3)
1194            if(icplexwf==2)then
1195              call unswitch_cent(n1dfft,max2,m2,n2,lot,n1,lzt,zw(1,1,inzee),zt(1,1,j))
1196            else
1197              call unswitchreal_cent(n1dfft,max2,n2,lot,n1,lzt,zw(1,1,inzee),zt(1,1,2*j-1))
1198            end if
1199          end do
1200 
1201          ! transform along x axis
1202          ! input: G2,R1,R3,(Rp3)
1203          lot=ncache/(4*n1)
1204 
1205          do j=1,m2eff,lot
1206            ma=j
1207            mb=min(j+(lot-1),m2eff)
1208            n1dfft=mb-ma+1
1209            i=1
1210            call fftstp(lzt,n1dfft,m1zt,lot,n1,zt(1,j,1),zw(1,1,1), &
1211 &                      trig1,after1(i),now1(i),before1(i),-1)
1212 
1213            inzee=1
1214            do i=2,ic1
1215              call fftstp(lot,n1dfft,n1,lot,n1,zw(1,1,inzee),zw(1,1,3-inzee), &
1216 &                        trig1,after1(i),now1(i),before1(i),-1)
1217              inzee=3-inzee
1218            end do
1219            ! output: G2,G1,R3,(Rp3)
1220 
1221            ! input:  G2,G1,R3,Gp2,(Rp3)
1222            ! output: G1,G2,R3,Gp2,(Rp3)
1223            if (nproc_fft==1) then
1224              call unmpiswitch_cent(j3,n1dfft,Jp2st,J2st,lot,max1,md1,m1,n1,&
1225 &              md2proc,nd3proc,nproc_fft,ioption,zw(1,1,inzee),zmpi2(:,:,:,:,idat))
1226            else
1227              call unmpiswitch_cent(j3,n1dfft,Jp2st,J2st,lot,max1,md1,m1,n1,&
1228 &              md2proc,nd3proc,nproc_fft,ioption,zw(1,1,inzee),zmpi1(:,:,:,:,idat))
1229            end if
1230          end do
1231 
1232         end if
1233      end do ! j3
1234 
1235      ! Interprocessor data transposition
1236      ! input:  G1,G2,R3,Gp2,(Rp3)
1237      ! output: G1,G2,R3,Rp3,(Gp2)
1238      if (nproc_fft>1) then
1239         call timab(544,1,tsec)
1240         call xmpi_alltoall(zmpi1(:,:,:,:,idat),2*md1*md2proc*nd3proc, &
1241 &                          zmpi2(:,:,:,:,idat),2*md1*md2proc*nd3proc,comm_fft,ierr)
1242 
1243         call timab(544,2,tsec)
1244      end if
1245 
1246      ! transform along z axis
1247      ! input: G1,G2,R3,(Gp2)
1248      lot=ncache/(4*n3)
1249 
1250      do j2=1,md2proc
1251        if (me_fft*md2proc+j2 <= m2eff) then
1252          ! write(std_out,*)' forwf_wf : before unscramble, j2,md2proc,me_fft,m2=',j2,md2proc,me_fft,m2
1253          do i1=1,m1,lot
1254            ma=i1
1255            mb=min(i1+(lot-1),m1)
1256            n1dfft=mb-ma+1
1257 
1258            ! input:  G1,G2,R3,(Gp2)
1259            ! output: G1,R3,G2,(Gp2)
1260            call unscramble(i1,j2,lot,n1dfft,md1,n3,md2proc,nnd3,zmpi2(:,:,:,:,idat),zw(1,1,1))
1261 
1262            inzee=1
1263            do i=1,ic3
1264              call fftstp(lot,n1dfft,n3,lot,n3,zw(1,1,inzee),zw(1,1,3-inzee), &
1265 &              trig3,after3(i),now3(i),before3(i),-1)
1266              inzee=3-inzee
1267            end do
1268 
1269            call unfill_cent(md1,md3,lot,n1dfft,max3,m3,n3,zw(1,1,inzee),zf(1,i1,1,j2,idat))
1270            ! output: G1,G3,G2,(Gp2)
1271          end do
1272        end if
1273      end do
1274 
1275      if (icplexwf==1) then
1276        ! Complete missing values with complex conjugate
1277        ! Inverse of ix is located at nx+2-ix , except for ix=1, for which it is 1.
1278        do i3=1,m3
1279          i3inv=m3+2-i3
1280          if(i3==1)i3inv=1
1281 
1282          if (m2eff>1) then
1283            do i2=2,m2eff
1284              i2inv=m2+2-i2
1285              zf(1,1,i3inv,i2inv,idat)= zf(1,1,i3,i2,idat)
1286              zf(2,1,i3inv,i2inv,idat)=-zf(2,1,i3,i2,idat)
1287              do i1=2,m1
1288                i1inv=m1+2-i1
1289                zf(1,i1inv,i3inv,i2inv,idat)= zf(1,i1,i3,i2,idat)
1290                zf(2,i1inv,i3inv,i2inv,idat)=-zf(2,i1,i3,i2,idat)
1291              end do
1292            end do
1293          end if
1294        end do
1295      end if
1296 
1297  end do ! idat
1298 
1299  ABI_DEALLOCATE(trig1)
1300  ABI_DEALLOCATE(after1)
1301  ABI_DEALLOCATE(now1)
1302  ABI_DEALLOCATE(before1)
1303  ABI_DEALLOCATE(trig2)
1304  ABI_DEALLOCATE(after2)
1305  ABI_DEALLOCATE(now2)
1306  ABI_DEALLOCATE(before2)
1307  ABI_DEALLOCATE(trig3)
1308  ABI_DEALLOCATE(after3)
1309  ABI_DEALLOCATE(now3)
1310  ABI_DEALLOCATE(before3)
1311  ABI_DEALLOCATE(zmpi2)
1312  ABI_DEALLOCATE(zw)
1313  ABI_DEALLOCATE(zt)
1314  if (nproc_fft>1)  then
1315    ABI_DEALLOCATE(zmpi1)
1316  end if
1317 
1318  !call timab(542,2,tsec)
1319 
1320 end subroutine sg2002_mpiforw_wf

m_sg2002/sg2002_mpifourdp [ Functions ]

[ Top ] [ m_sg2002 ] [ Functions ]

NAME

 sg2002_mpifourdp

FUNCTION

 Conduct Fourier transform of REAL or COMPLEX function f(r)=fofr defined on
 fft grid in real space, to create complex f(G)=fofg defined on full fft grid
 in reciprocal space, in full storage mode, or the reverse operation.
 For the reverse operation, the final data is divided by nfftot.
 REAL case when cplex=1, COMPLEX case when cplex=2
 Usually used for density and potentials.

INPUTS

 cplex=1 if fofr is real, 2 if fofr is complex
 nfft=(effective) number of FFT grid points (for this processor)
 ngfft(18)=contain all needed information about 3D FFT, see ~abinit/doc/variables/vargs.htm#ngfft
 ndat=Numbre of FFT transforms
 isign=sign of Fourier transform exponent: current convention uses
    +1 for transforming from G to r
    -1 for transforming from r to G.
 fftn2_distrib(2),ffti2_local(2)
 fftn3_distrib(3),ffti3_local(3)
 comm_fft=MPI communicator

SIDE EFFECTS

 Input/Output
 fofg(2,nfft)=f(G), complex.
 fofr(cplex*nfft)=input function f(r) (real or complex)

TODO

  Write simplified API for sequential version.

PARENTS

      fourdp,m_fft

CHILDREN

SOURCE

1364 subroutine sg2002_mpifourdp(cplex,nfft,ngfft,ndat,isign,&
1365 &  fftn2_distrib,ffti2_local,fftn3_distrib,ffti3_local,fofg,fofr,comm_fft)
1366 
1367 
1368 !This section has been created automatically by the script Abilint (TD).
1369 !Do not modify the following lines by hand.
1370 #undef ABI_FUNC
1371 #define ABI_FUNC 'sg2002_mpifourdp'
1372 !End of the abilint section
1373 
1374  implicit none
1375 
1376 !Arguments ------------------------------------
1377 !scalars
1378  integer,intent(in) :: cplex,isign,nfft,ndat,comm_fft
1379 !arrays
1380  integer,intent(in) :: ngfft(18)
1381  integer,intent(in) :: fftn2_distrib(ngfft(2)),ffti2_local(ngfft(2))
1382  integer,intent(in) :: fftn3_distrib(ngfft(3)),ffti3_local(ngfft(3))
1383  real(dp),intent(inout) :: fofg(2,nfft*ndat),fofr(cplex*nfft*ndat)
1384 
1385 !Local variables-------------------------------
1386 !scalars
1387  integer :: n1,n2,n3,n4,n5,n6,nd2proc,nd3proc,nproc_fft,me_fft
1388 !arrays
1389  real(dp),allocatable :: workf(:,:,:,:,:),workr(:,:,:,:,:)
1390 
1391 ! *************************************************************************
1392 
1393  ! Note the only c2c is supported in parallel.
1394  n1=ngfft(1); n2=ngfft(2); n3=ngfft(3)
1395  n4=ngfft(4); n5=ngfft(5); n6=ngfft(6)
1396  me_fft=ngfft(11); nproc_fft=ngfft(10)
1397 
1398  nd2proc=((n2-1)/nproc_fft) +1
1399  nd3proc=((n6-1)/nproc_fft) +1
1400  ABI_ALLOCATE(workr,(2,n4,n5,nd3proc,ndat))
1401  ABI_ALLOCATE(workf,(2,n4,n6,nd2proc,ndat))
1402 
1403  ! Complex to Complex
1404  select case (isign)
1405  case (1)
1406    ! G --> R
1407    call mpifft_fg2dbox(nfft,ndat,fofg,n1,n2,n3,n4,nd2proc,n6,fftn2_distrib,ffti2_local,me_fft,workf)
1408 
1409    call sg2002_back(2,ndat,n1,n2,n3,n4,n5,n6,n4,nd2proc,nd3proc,2,workf,workr,comm_fft)
1410 
1411    call mpifft_dbox2fr(n1,n2,n3,n4,n5,nd3proc,ndat,fftn3_distrib,ffti3_local,me_fft,workr,cplex,nfft,fofr)
1412 
1413  case (-1)
1414    ! R --> G
1415    call mpifft_fr2dbox(cplex,nfft,ndat,fofr,n1,n2,n3,n4,n5,nd3proc,fftn3_distrib,ffti3_local,me_fft,workr)
1416 
1417    call sg2002_forw(2,ndat,n1,n2,n3,n4,n5,n6,n4,nd2proc,nd3proc,2,workr,workf,comm_fft)
1418 
1419    ! Transfer FFT output to the original fft box.
1420    call mpifft_dbox2fg(n1,n2,n3,n4,nd2proc,n6,ndat,fftn2_distrib,ffti2_local,me_fft,workf,nfft,fofg)
1421 
1422  case default
1423    MSG_BUG("Wrong isign")
1424  end select
1425 
1426  ABI_DEALLOCATE(workr)
1427  ABI_DEALLOCATE(workf)
1428 
1429 end subroutine sg2002_mpifourdp