/synthbench/euroben-ports/base/C/mod2f/.svn/text-base/cfft4.c.svn-base - PRACE NPT - Greek Research and Technology Network's projects

root / synthbench / euroben-ports / base / C / mod2f / .svn / text-base / cfft4.c.svn-base @ 0:839f52ef7657

History | View | Annotate | Download (13.3 kB)

       #include <math.h>
       #include <stdio.h>
       #include <stdlib.h>
       void cfft4( int is, int m, double ur[], double ui[], double xr[], double xi[],
                   double wr[], double wi[] )
       /* ----------------------------------------------------------------------
       ! --- CFFT4 performs an N-point complex-to-complex FFT, where N = 2**M.
       !     On input XR holds the Real part of the complex array;
       !     XI holds the Imaginary part.
       !     On output XR contains the Real part of the transformed array.
       !     XI then contains the Imaginary part of the result.
       !     WR and WI are work arrays that store Real and Imaginary parts
       !     of intermediate results.
       !     UR and UI contain rotation factors for various stages of the
       !     transformation.
+      !
       ! --- Before doing the actual transformation UR and UI must be
       !     initialised by calling CFFT4 with IS = 0, and M = MX, where MX
       !     is the maximum value of M that will be needed for any call of
       !     CFFT4.
+      !
       ! --- A call to CFFT4 with IS = 1 (or -1) indicates a call to perform
       !     a FFT with positive (or negative) exponentials.
+      !
       !     For any call to CFFT4 M must be M >= 2.
       !     Dimensions of the arrays XR, XI, UR, UI, WR, WI are:
+      !
       !     XR[N], XI[N], WR[N], WI[N].
       !     UR[2**MX+1],       UI[2**MX+1]       for MX even.
       !     UR[2*2**(MX/3)+1], UI[2*2**(MX/3)+1] for MX odd.
+      !
       !     In this routine we use 8-byte Real elements (doubles).
       ! ---------------------------------------------------------------------- */
+      {
          int          i,  it, j,  j0, j1, j2, j3, j4, j5, k , kn, ku, k0,
                       k1, k2, k3, ln, l1, l2, l3, l4, mx, n , nu, n1, n2, n3;
          const double pi = 3.141592653589793;
          double       c0r, c0i, c1r, c1i, c2r, c2i, c3r, c3i,
                       d0r, d0i, d1r, d1i, d2r, d2i, d3r, d3i,
                       t  , ti , u1r, u1i, u2r, u2i, u3r, u3i,
                       x1 , x2 , w1 , w2;
       // ---------------------------------------------------------------------
       // --- Initialise the U array with sines and cosines in a manner that
       //     permits stride one access at each FFT iteration.
              if ( is == 0 ) {
                nu     = (int)pow( 2, ((m+1)/2*2) )/3;
                ur[0] = 64*nu + m;
                ku    = 1;
                ln    = 1;
                for( j = 0; j <= (m+1)/2 - 1; j++ ) {
                   t = pi/(2*ln);
                   for( i = 0; i <= ln - 1; i++ ) {
                      ti       = i*t;
                      ur[i+ku] = cos( ti );
                      ui[i+ku] = sin( ti );
+                  }
                   ku = ku + ln;
                   ln = 4*ln;
+               }
                return;
+            }
       // ---------------------------------------------------------------------
       // --- Transformation call to CFFT4: set initial parameters.
             n  = (int)pow( 2, m );
             k  = ur[0];
             mx = k%64;
       // ---------------------------------------------------------------------
       // --- Check for invalid input parameters.
             if ( ( is != 1 && is != -1 ) || m < 2 || m > mx ) {
                printf( " *** Error in CFFT4: Either U has not been initialised\n" );
                printf( "  or one of the input parameters IS or M is invalid:\n" );
                printf( " is = %5d; m = %5d\n", is, m );
                abort();
+            }
       // ---------------------------------------------------------------------
             nu = k/64;
             n1 = n/4;
             n2 = 2*n1;
             n3 = 3*n1;
             ln = n1;
       // ---------------------------------------------------------------------
       // --- For the first two radix-4 iterations, the inner loop increments J
       //     instead of K. Non-unit stride memory access is used but the longer
       //     vector lengths should compensate for that.
             j0 = 0;
             j1 = j0 + 1;
             j2 = j0 + 2;
             j3 = j0 + 3;
             k0 = j0;
             k1 = k0 + n1;
             k2 = k0 + n2;
             k3 = k0 + n3;
             for( j = 0; j <= ln - 1; j++ ) {
                c0r = xr[j+k0];
                c0i = xi[j+k0];
                c1r = xr[j+k1];
                c1i = xi[j+k1];
                c2r = xr[j+k2];
                c2i = xi[j+k2];
                c3r = xr[j+k3];
                c3i = xi[j+k3];
                d0r = c0r + c2r;
                d0i = c0i + c2i;
                d1r = c0r - c2r;
                d1i = c0i - c2i;
                d2r = c1r + c3r;
                d2i = c1i + c3i;
                d3r = is*(-c1i + c3i);
                d3i = is*( c1r - c3r);
                j4  = 4*j;
                wr[j4+j0] = d0r + d2r;
                wi[j4+j0] = d0i + d2i;
                wr[j4+j1] = d1r + d3r;
                wi[j4+j1] = d1i + d3i;
                wr[j4+j2] = d0r - d2r;
                wi[j4+j2] = d0i - d2i;
                wr[j4+j3] = d1r - d3r;
                wi[j4+j3] = d1i - d3i;
+            }
       // ---------------------------------------------------------------------
       // --- If M = 2 all transformation work is done, just copy WR/WI into
       //     XR/XI (label L200).
       //     If M = 3, one radix-2 iteration still has to be done (label L300).
             if ( m == 2 ) goto L200;
             if ( m == 3 ) goto L300;
       // --- Else:
       // ----------------------------------------------------------------------
       // --- Perform the second radix-4 iteration.
             ln = ln/4;
             for( k = 0; k <= 3; k++ ) {
                j0  = k;
                j1  = j0 + 4;
                j2  = j0 + 8;
                j3  = j0 + 12;
                k0  = j0;
                k1  = k0 + n1;
                k2  = k0 + n2;
                k3  = k0 + n3;
                u1r =    ur[k+2];
                u1i = is*ui[k+2];
                u2r = u1r*u1r - u1i*u1i;
                u2i = 2.0*u1r*u1i;
                u3r = u1r*u2r - u1i*u2i;
                u3i = u1r*u2i + u1i*u2r;
                for( j = 0; j <= ln - 1; j++ ) {
                   j4  = 4*j;
                   c0r = wr[j4+k0];
                   c0i = wi[j4+k0];
                   w1  = wr[j4+k1];
                   w2  = wi[j4+k1];
                   c1r = u1r*w1 - u1i*w2;
                   c1i = u1r*w2 + u1i*w1;
                   w1  = wr[j4+k2];
                   w2  = wi[j4+k2];
                   c2r = u2r*w1 - u2i*w2;
                   c2i = u2r*w2 + u2i*w1;
                   w1  = wr[j4+k3];
                   w2  = wi[j4+k3];
                   c3r = u3r*w1 - u3i*w2;
                   c3i = u3r*w2 + u3i*w1;
                   d0r = c0r + c2r;
                   d0i = c0i + c2i;
                   d1r = c0r - c2r;
                   d1i = c0i - c2i;
                   d2r = c1r + c3r;
                   d2i = c1i + c3i;
                   d3r = is*(-c1i + c3i);
                   d3i = is*( c1r - c3r);
                   j5  = 16*j;
                   xr[j5+j0] = d0r + d2r;
                   xi[j5+j0] = d0i + d2i;
                   xr[j5+j1] = d1r + d3r;
                   xi[j5+j1] = d1i + d3i;
                   xr[j5+j2] = d0r - d2r;
                   xi[j5+j2] = d0i - d2i;
                   xr[j5+j3] = d1r - d3r;
                   xi[j5+j3] = d1i - d3i;
+               }
+            }
       // ---------------------------------------------------------------------
       // --- Set parameters for subsequent iterations.
             ku = 7;
             ln = ln/4;
             l1 = 16;
             l2 = 32;
             l3 = 48;
             l4 = 64;
       // ---------------------------------------------------------------------
       // --- For all other radix-4 iterations, the inner loops increment K. In
       //     this manner, all array references have unit stride. The radix-4
       //     iterations are performed in pairs: XR/XI to WR/WI and vice
       //     versa. This eliminates the need to copy data between X and W
       //     after each iteration.
       // --- If IT = M - 1 then only one radix-2 iteration needs to be done
       //     (label L100).
             for( it = 4; it <= m - 1; it+= 4 ) {
                if ( it == m - 1 ) goto L100;
                for(  j = 0; j <= ln - 1; j++ ) {
                   j0 = j*l4;
                   j1 = j0 + l1;
                   j2 = j0 + l2;
                   j3 = j0 + l3;
                   k0 = j*l1;
                   k1 = k0 + n1;
                   k2 = k0 + n2;
                   k3 = k0 + n3;
                   for( k = 0; k <= l1 - 1; k++ ) {
                      u1r =    ur[k+ku-1];
                      u1i = is*ui[k+ku-1];
                      u2r = u1r*u1r - u1i*u1i;
                      u2i = 2.0*u1r*u1i;
                      u3r = u1r*u2r - u1i*u2i;
                      u3i = u1r*u2i + u1i*u2r;
                      c0r = xr[k+k0];
                      c0i = xi[k+k0];
                      x1  = xr[k+k1];
                      x2  = xi[k+k1];
                      c1r = u1r*x1 - u1i*x2;
                      c1i = u1r*x2 + u1i*x1;
                      x1  = xr[k+k2];
                      x2  = xi[k+k2];
                      c2r = u2r*x1 - u2i*x2;
                      c2i = u2r*x2 + u2i*x1;
                      x1  = xr[k+k3];
                      x2  = xi[k+k3];
                      c3r = u3r*x1 - u3i*x2;
                      c3i = u3r*x2 + u3i*x1;
                      d0r = c0r + c2r;
                      d0i = c0i + c2i;
                      d1r = c0r - c2r;
                      d1i = c0i - c2i;
                      d2r = c1r + c3r;
                      d2i = c1i + c3i;
                      d3r = is*(-c1i + c3i);
                      d3i = is*( c1r - c3r);
                      wr[k+j0] = d0r + d2r;
                      wi[k+j0] = d0i + d2i;
                      wr[k+j1] = d1r + d3r;
                      wi[k+j1] = d1i + d3i;
                      wr[k+j2] = d0r - d2r;
                      wi[k+j2] = d0i - d2i;
                      wr[k+j3] = d1r - d3r;
                      wi[k+j3] = d1i - d3i;
+                  }
+               }
                ku = ku + l1;
                ln = ln/4;
                l1 = l4;
                l2 = 2*l1;
                l3 = 3*l1;
                l4 = 4*l1;
       //  ----------------------------------------------------------------------
       // --- If IT = M - 2 the computation is complete except for copying
       //                   WR/WI to XR/XI (label L200).
       // --- If IT = M - 3 then only one radix-2 iteration remains to be done
       //                   (label L300).
                if ( it == m - 2 ) goto L200;
                if ( it == m - 3 ) goto L300;
                for( j = 0; j <= ln - 1; j++ ) {
                   j0 = j*l4;
                   j1 = j0 + l1;
                   j2 = j0 + l2;
                   j3 = j0 + l3;
                   k0 = j*l1;
                   k1 = k0 + n1;
                   k2 = k0 + n2;
                   k3 = k0 + n3;
                   for( k = 0; k <= l1 - 1; k++ ) {
                      u1r =    ur[k+ku-1];
                      u1i = is*ui[k+ku-1];
                      u2r = u1r*u1r - u1i*u1i;
                      u2i = 2.0*u1r*u1i;
                      u3r = u1r*u2r - u1i*u2i;
                      u3i = u1r*u2i + u1i*u2r;
                      c0r = wr[k+k0];
                      c0i = wi[k+k0];
                      w1  = wr[k+k1];
                      w2  = wi[k+k1];
                      c1r = u1r*w1 - u1i*w2;
                      c1i = u1r*w2 + u1i*w1;
                      w1  = wr[k+k2];
                      w2  = wi[k+k2];
                      c2r = u2r*w1 - u2i*w2;
                      c2i = u2r*w2 + u2i*w1;
                      w1  = wr[k+k3];
                      w2  = wi[k+k3];
                      c3r = u3r*w1 - u3i*w2;
                      c3i = u3r*w2 + u3i*w1;
                      d0r = c0r + c2r;
                      d0i = c0i + c2i;
                      d1r = c0r - c2r;
                      d1i = c0i - c2i;
                      d2r = c1r + c3r;
                      d2i = c1i + c3i;
                      d3r = is*(-c1i + c3i);
                      d3i = is*( c1r - c3r);
                      xr[k+j0] = d0r + d2r;
                      xi[k+j0] = d0i + d2i;
                      xr[k+j1] = d1r + d3r;
                      xi[k+j1] = d1i + d3i;
                      xr[k+j2] = d0r - d2r;
                      xi[k+j2] = d0i - d2i;
                      xr[k+j3] = d1r - d3r;
                      xi[k+j3] = d1i - d3i;
+                  }
+               }
                ku = ku + l1;
                ln = ln/4;
                l1 = l4;
                l2 = 2*l1;
                l3 = 3*l1;
                l4 = 4*l1;
+            }
             return;
       //  ----------------------------------------------------------------------
       // --- Perform one radix-2 iteration to complete the calculation. This
       //     branch is taken when the last iteration performed above is stored
       //     in XR/XI.
       L100: l2 = l1/2;
             j0 = 0;
             j1 = j0 + l1;
             j2 = j0 + l2;
             j3 = j1 + l2;
             k0 = 0;
             k1 = k0 + l1;
             k2 = k0 + l2;
             k3 = k1 + l2;
             for( k = 0; k <= l2 - 1; k++ ) {
                u1r =    ur[2*k+ku-1];
                u1i = is*ui[2*k+ku-1];
                c0r = xr[k+k0];
                c0i = xi[k+k0];
                x1  = xr[k+k1];
                x2  = xi[k+k1];
                c1r = u1r*x1 - u1i*x2;
                c1i = u1r*x2 + u1i*x1;
                wr[k+j0] = c0r + c1r;
                wi[k+j0] = c0i + c1i;
                wr[k+j1] = c0r - c1r;
                wi[k+j1] = c0i - c1i;
                u2r = -is*u1i;
                u2i =  is*u1r;
                c0r = xr[k+k2];
                c0i = xi[k+k2];
                x1  = xr[k+k3];
                x2  = xi[k+k3];
                c1r = u2r*x1 - u2i*x2;
                c1i = u2r*x2 + u2i*x1;
                wr[k+j2] = c0r + c1r;
                wi[k+j2] = c0i + c1i;
                wr[k+j3] = c0r - c1r;
                wi[k+j3] = c0i - c1i;
+            }
       L200: for( k = 0; k <= n - 1; k++ ) {
                xr[k] = wr[k];
                xi[k] = wi[k];
+            }
             return;
       // ---------------------------------------------------------------------
       // --- Perform one radix-2 iteration to complete the calculation. This
       //     branch is taken if the last iteration performed above is stored
       //     in W. If M = 3, the usual settings of L1 and KU do not hold, so
       //     they have to be set in this case.
       L300: if ( m == 3 ) {
                l1 = 4;
                ku = 3;
+            }
             l2 = l1/2;
             j0 = 0;
             j1 = j0 + l1;
             j2 = j0 + l2;
             j3 = j1 + l2;
             k0 = 0;
             k1 = k0 + l1;
             k2 = k0 + l2;
             k3 = k1 + l2;
             for( k = 0; k <= l2 - 1; k++ ) {
                u1r =    ur[2*k+ku-1];
                u1i = is*ui[2*k+ku-1];
                c0r = wr[k+k0];
                c0i = wi[k+k0];
                w1  = wr[k+k1];
                w2  = wi[k+k1];
                c1r = u1r*w1 - u1i*w2;
                c1i = u1r*w2 + u1i*w1;
                xr[k+j0] = c0r + c1r;
                xi[k+j0] = c0i + c1i;
                xr[k+j1] = c0r - c1r;
                xi[k+j1] = c0i - c1i;
                u2r = -is*u1i;
                u2i =  is*u1r;
                c0r = wr[k+k2];
                c0i = wi[k+k2];
                w1  = wr[k+k3];
                w2  = wi[k+k3];
                c1r = u2r*w1 - u2i*w2;
                c1i = u2r*w2 + u2i*w1;
                xr[k+j2] = c0r + c1r;
                xi[k+j2] = c0i + c1i;
                xr[k+j3] = c0r - c1r;
                xi[k+j3] = c0i - c1i;
+            }
       // ---------------------------------------------------------------------
       // --- Finished: Return
+      }