It’s been a while since I checked here… I got a little discouraged after a while. Anyway, I’ve got a working 1k FFT, and parts of a 2k FFT. The 2k (ported from vvolkov’s code) is a 512-element and a 4-element pass, interleaved. It doesn’t work currently, and I’ve kinda stalled on it. I’m posting the OpenCL kernel code with this. It’s pretty simple to use: the local workgroup size is 64,1,1 and there is an optional twiddle factor pre-computation step (in the case of the 1k FFT).
twiddle factor computation code
// Write e^(i*phi) = (cos phi, sin phi) into the two components of `a`.
// NOTE(review): this only updates the caller's element if cl_float2 is an
// array typedef (as in the original OpenCL 1.0 cl_platform.h); with the
// later union-based cl_float2 this parameter is pass-by-value and the
// result is silently lost — verify against the cl_platform.h in use.
void exp_i(cl_float2 a, float phi )
{
a[0] = cosf(phi);
a[1] = sinf(phi);
}
// Fill `array` with the twiddle factors consumed by the 1024-point FFT
// kernel: 1024 entries of per-thread radix-16 factors (bit-reversed bin
// order), followed by 48 entries (3 per row) for the 4x4 stage.
void computeTwiddleFactors( cl_float2 *array )
{
    // Twiddle16 factors: entry j of thread i is exp(-2*pi*rev[j]*i/1024),
    // where rev[] is the bit-reversed order used by the device kernel.
    static const int rev[16] = { 0, 8, 4, 12, 2, 10, 6, 14,
                                 1, 9, 5, 13, 3, 11, 7, 15 };
    for (int i = 0; i < 64; i++) {
        for (int j = 0; j < 16; j++) {
            exp_i(array[j + i*16], (-2. * M_PI * rev[j] / 1024) * i);
        }
    }
    // Twiddle4x4 factors: three factors per row i, at -2, -1, and -3 times
    // (pi/32)*i, matching twiddle4x4()'s access pattern on the device.
    array += 1024;
    for (int i = 0; i < 16; i++) {
        exp_i(array[0 + 3*i], (-2. * M_PI/32.) * i);
        exp_i(array[1 + 3*i], (-1. * M_PI/32.) * i);
        exp_i(array[2 + 3*i], (-3. * M_PI/32.) * i);
    }
}
OpenCL Code
/*
* 1024FFT.cl
* CLFFT
*
* Created by William Dillon on 6/18/09.
* Adapted from Vasily Volkov's CUDA FFT http://www.eecs.berkeley.edu/~volkov/
* Copyright 2009 Oregon State University. All rights reserved.
*
*/
// cos(pi/8) and sin(pi/8), used to build the radix-16 twiddle constants.
#define COS_PI_8 0.923879533f
#define SIN_PI_8 0.382683432f
// NOTE(review): double-precision literals; multiplying a float2 by these
// promotes to double, which requires fp64 support in OpenCL C — consider
// float literals (e.g. 0.70710678f) if the target lacks cl_khr_fp64.
#define M_SQRT1_2 7.0710678118654752440E-1
#define M_PI 3.141592653589793
// Twiddle factors exp(-2*pi*i*k/16) as (cos, sin) pairs.
// exp_5_16 and exp_7_16 are not referenced in the code shown.
#define exp_1_16 (float2)( COS_PI_8, -SIN_PI_8 )
#define exp_3_16 (float2)( SIN_PI_8, -COS_PI_8 )
#define exp_5_16 (float2)( -SIN_PI_8, -COS_PI_8 )
#define exp_7_16 (float2)( -COS_PI_8, -SIN_PI_8 )
#define exp_9_16 (float2)( -COS_PI_8, SIN_PI_8 )
#define exp_1_8 (float2)( 1, -1 )//requires post-multiply by 1/sqrt(2)
#define exp_1_4 (float2)( 0, -1 )
#define exp_3_8 (float2)( -1, -1 )//requires post-multiply by 1/sqrt(2)
// NOTE(review): NULL is typically already provided; harmless redefinition.
#define NULL 0
// Compile-time switches: unrolled stores/loads, and optional use of a
// host-precomputed twiddle-factor table (1k FFT only).
#define UNROLL_LOOPS
//#define PRECOMPUTE_FACTORS
// Complex multiply: returns a*b with complex numbers packed as (re, im).
// Fixed: make_float2() is CUDA, not OpenCL C — use a vector literal.
inline float2 complex_mult( float2 a, float2 b )
{
    return (float2)( a.x*b.x-a.y*b.y, a.x*b.y+a.y*b.x );
}
// In-place 2-point butterfly: (a0, a1) <- (a0 + a1, a0 - a1).
inline void FFT2(float2 *a0, float2 *a1 )
{
    float2 sum  = *a0 + *a1;
    float2 diff = *a0 - *a1;
    *a0 = sum;
    *a1 = diff;
}
// In-place 4-point FFT (decimation in time, radix-2 stages).
// Outputs land in bit-reversed bin order: a0=X0, a1=X2, a2=X1, a3=X3.
inline void FFT4(float2 *a0, float2 *a1, float2 *a2, float2 *a3 )
{
FFT2( a0, a2 );
FFT2( a1, a3 );
*a3 = complex_mult(*a3, exp_1_4); // twiddle by -i
FFT2( a0, a1 );
FFT2( a2, a3 );
}
// In-place 8-point FFT over a[0..7]; outputs in bit-reversed order
// (consumed by the rev tables in store8/storex8/storey8).
// NOTE(review): the * M_SQRT1_2 factors multiply a float2 by a double
// literal (exp_1_8/exp_3_8 fold the 1/sqrt(2) magnitude out) — requires
// fp64 unless M_SQRT1_2 is made a float constant; confirm on target.
inline void FFT8(float2 *a)
{
FFT2( &a[0], &a[4] );
FFT2( &a[1], &a[5] );
FFT2( &a[2], &a[6] );
FFT2( &a[3], &a[7] );
a[5] = complex_mult(a[5], exp_1_8) * M_SQRT1_2;
a[6] = complex_mult(a[6], exp_1_4);
a[7] = complex_mult(a[7], exp_3_8) * M_SQRT1_2;
FFT4( &a[0], &a[1], &a[2], &a[3] );
FFT4( &a[4], &a[5], &a[6], &a[7] );
}
// In-place 16-point FFT as radix-4 x radix-4 over a[0..15]; outputs are in
// digit-reversed order (consumed by store16/storex16/storey16 rev tables).
// First pass: four FFT4s over the stride-4 columns; the twiddles below are
// W16^(r*j) where r is the (bit-reversed) bin of the first pass.
inline void FFT16(float2 *a )
{
FFT4( &a[0], &a[4], &a[8], &a[12] );
FFT4( &a[1], &a[5], &a[9], &a[13] );
FFT4( &a[2], &a[6], &a[10], &a[14] );
FFT4( &a[3], &a[7], &a[11], &a[15] );
a[5] = complex_mult(a[5], exp_1_8 ) * M_SQRT1_2;
a[6] = complex_mult(a[6], exp_1_4 );
a[7] = complex_mult(a[7], exp_3_8 ) * M_SQRT1_2;
a[9] = complex_mult(a[9], exp_1_16);
a[10] = complex_mult(a[10], exp_1_8 ) * M_SQRT1_2;
a[11] = complex_mult(a[11], exp_3_16);
a[13] = complex_mult(a[13], exp_3_16);
a[14] = complex_mult(a[14], exp_3_8 ) * M_SQRT1_2;
a[15] = complex_mult(a[15], exp_9_16);
FFT4( &a[0], &a[1], &a[2], &a[3] );
FFT4( &a[4], &a[5], &a[6], &a[7] );
FFT4( &a[8], &a[9], &a[10], &a[11] );
FFT4( &a[12], &a[13], &a[14], &a[15] );
}
// Four independent 4-point FFTs over consecutive groups of 4 registers.
inline void FFT4x4(float2 *a )
{
    for( int i = 0; i < 16; i += 4 )
        FFT4( &a[i], &a[i+1], &a[i+2], &a[i+3] );
}
//
// Store results in global memory
//
#ifdef UNROLL_LOOPS
// Store 4 results to global memory with stride sx, undoing FFT4's
// bit-reversed output order (0,2,1,3).
inline void store4( float2 *a, global float2 *x, int sx )
{
    const int order[4] = {0, 2, 1, 3};
    for( int i = 0; i < 4; i++ )
        x[i*sx] = a[order[i]];
}
// Store 8 normalized results (divided by sqrt(512)) at the hard-coded
// 64-thread stride, undoing FFT8's bit-reversed output order.
inline void store8( float2 *a, global float2 *x)
{
    float scalar = sqrt(512.);
    const int order[8] = {0, 4, 2, 6, 1, 5, 3, 7};
    for( int i = 0; i < 8; i++ )
        x[i*64] = a[order[i]] / scalar;
}
// Store 16 normalized results (divided by 32 == sqrt(1024)) at the
// hard-coded 64-thread stride, undoing FFT16's digit-reversed order.
inline void store16( float2 *a, global float2 *x)
{
    float scalar = 32; //sqrt(1024.);
    const int order[16] = { 0, 8, 4,12, 2,10, 6,14,
                            1, 9, 5,13, 3,11, 7,15 };
    for( int i = 0; i < 16; i++ )
        x[i*64] = a[order[i]] / scalar;
}
#else
// Store 4 results with stride sx, undoing FFT4's bit-reversed order.
// Fixed: rev[] was declared with 8 elements but only 4 initializers.
inline void store4( float2 *a, global float2 *x, int sx )
{
    int rev[4] = {0,2,1,3};
    for( int i = 0; i < 4; i++ )
        x[i*sx] = a[rev[i]];
}
// Store 8 normalized results, undoing FFT8's bit-reversed order.
// Fixed: this branch took an extra sx parameter, so call sites like
// store8(a, dst) in fft1D_512 would not compile with UNROLL_LOOPS
// undefined; signature and stride now match the unrolled variant.
inline void store8( float2 *a, global float2 *x )
{
    float scalar = sqrt(512.);
    int rev[8] = {0,4,2,6,1,5,3,7};
    for( int i = 0; i < 8; i++ )
        x[i*64] = a[rev[i]] / scalar;
}
// Store 16 normalized results, undoing FFT16's digit-reversed order.
// Fixed: this branch took an extra sx parameter, so the call site
// store16(a, dst) in fft1D_1024 would not compile with UNROLL_LOOPS
// undefined; signature and stride now match the unrolled variant.
inline void store16( float2 *a, global float2 *x )
{
    float scalar = 32; //sqrt(1024.);
    int rev[16] = { 0, 8, 4,12, 2,10, 6,14, 1, 9, 5,13, 3,11, 7,15};
    for( int i = 0; i < 16; i++ )
        x[i*64] = a[rev[i]] / scalar;
}
#endif
//
// transpose via shared memory, input is in bit-reversed layout
//
#ifdef UNROLL_LOOPS
// Stage the real components into local memory with stride sx, in FFT8's
// bit-reversed output order.
inline void storex8( float2 *a, local float *x, int sx )
{
    const int order[8] = {0, 4, 2, 6, 1, 5, 3, 7};
    for( int i = 0; i < 8; i++ )
        x[i*sx] = a[order[i]].x;
}
// Stage the real components into local memory with stride sx, in FFT16's
// digit-reversed output order.
inline void storex16( float2 *a, local float *x, int sx )
{
    const int order[16] = { 0, 8, 4,12, 2,10, 6,14,
                            1, 9, 5,13, 3,11, 7,15 };
    for( int i = 0; i < 16; i++ )
        x[i*sx] = a[order[i]].x;
}
#else
// Stage the real components into local memory (stride sx, bit-reversed).
inline void storex8( float2 *a, local float *x, int sx )
{
    int rev[8] = {0,4,2,6,1,5,3,7};
    for( int i = 0; i < 8; i++, x += sx )
        *x = a[ rev[i] ].x;
}
// Stage the real components into local memory (stride sx, digit-reversed).
inline void storex16( float2 *a, local float *x, int sx )
{
    int rev[16] = { 0, 8, 4,12, 2,10, 6,14, 1, 9, 5,13, 3,11, 7,15};
    for( int i = 0; i < 16; i++, x += sx )
        *x = a[ rev[i] ].x;
}
#endif
#ifdef UNROLL_LOOPS
// Stage the imaginary components into local memory with stride sx, in
// FFT8's bit-reversed output order.
inline void storey8( float2 *a, local float *x, int sx )
{
    const int order[8] = {0, 4, 2, 6, 1, 5, 3, 7};
    for( int i = 0; i < 8; i++ )
        x[i*sx] = a[order[i]].y;
}
// Stage the imaginary components into local memory with stride sx, in
// FFT16's digit-reversed output order.
inline void storey16( float2 *a, local float *x, int sx )
{
    const int order[16] = { 0, 8, 4,12, 2,10, 6,14,
                            1, 9, 5,13, 3,11, 7,15 };
    for( int i = 0; i < 16; i++ )
        x[i*sx] = a[order[i]].y;
}
#else
// Stage the imaginary components into local memory (stride sx, bit-reversed).
inline void storey8( float2 *a, local float *x, int sx )
{
    int rev[8] = {0,4,2,6,1,5,3,7};
    for( int i = 0; i < 8; i++, x += sx )
        *x = a[ rev[i] ].y;
}
// Stage the imaginary components into local memory (stride sx, digit-reversed).
inline void storey16( float2 *a, local float *x, int sx )
{
    int rev[16] = { 0, 8, 4,12, 2,10, 6,14, 1, 9, 5,13, 3,11, 7,15};
    for( int i = 0; i < 16; i++, x += sx )
        *x = a[ rev[i] ].y;
}
#endif
#ifdef UNROLL_LOOPS
// Stage the real components for the 4x4 stage: within each group of 4,
// swap the middle pair (FFT4's bit-reversal).
// Fixed: the stride was hard-coded to 69, silently ignoring the sx
// parameter; the only visible caller (transpose4x4) passes sx == 69, so
// behavior is unchanged, but the function now honors its argument.
inline void storex4x4( float2 *a, local float *x, int sx)
{
    int rev4x4[16] = { 0, 2, 1, 3, 4, 6, 5, 7, 8,10, 9,11,12,14,13,15};
    for( int i = 0; i < 16; i++ )
        x[i*sx] = a[ rev4x4[i] ].x;
}
#else
// Stage the real components for the 4x4 stage (middle pair of each group
// of 4 swapped), stride sx.
inline void storex4x4( float2 *a, local float *x, int sx)
{
    int rev4x4[16] = { 0, 2, 1, 3, 4, 6, 5, 7, 8,10, 9,11,12,14,13,15};
    for( int i = 0; i < 16; i++, x += sx )
        *x = a[ rev4x4[i] ].x;
}
#endif
#ifdef UNROLL_LOOPS
// Stage the imaginary components for the 4x4 stage: within each group of
// 4, swap the middle pair (FFT4's bit-reversal).
// Fixed: the stride was hard-coded to 69, silently ignoring the sx
// parameter; the only visible caller (transpose4x4) passes sx == 69, so
// behavior is unchanged, but the function now honors its argument.
inline void storey4x4( float2 *a, local float *x, int sx)
{
    int rev4x4[16] = { 0, 2, 1, 3, 4, 6, 5, 7, 8,10, 9,11,12,14,13,15};
    for( int i = 0; i < 16; i++ )
        x[i*sx] = a[ rev4x4[i] ].y;
}
#else
// Stage the imaginary components for the 4x4 stage (middle pair of each
// group of 4 swapped), stride sx.
inline void storey4x4( float2 *a, local float *x, int sx)
{
    int rev4x4[16] = { 0, 2, 1, 3, 4, 6, 5, 7, 8,10, 9,11,12,14,13,15};
    for( int i = 0; i < 16; i++, x += sx )
        *x = a[ rev4x4[i] ].y;
}
#endif
#ifdef UNROLL_LOOPS
// Load 4 elements from global memory with stride sx into registers.
inline void load4( float2 *a, global float2 *x, int sx )
{
    for( int i = 0; i < 4; i++ )
        a[i] = x[i*sx];
}
// Load 8 elements from global memory with stride sx into registers.
inline void load8( float2 *a, global float2 *x, int sx )
{
    for( int i = 0; i < 8; i++ )
        a[i] = x[i*sx];
}
// Load 16 elements from global memory with stride sx into registers.
inline void load16( float2 *a, global float2 *x, int sx )
{
    for( int i = 0; i < 16; i++ )
        a[i] = x[i*sx];
}
#else
// Load 4 elements from global memory with stride sx.
// Fixed: this was mistakenly named load8 — it loops only 4 elements,
// duplicated the real load8 below (redefinition error), and left load4
// undefined in the non-unrolled build.
inline void load4( float2 *a, global float2 *x, int sx)
{
    for(int i = 0; i < 4; i++)
        a[i] = x[i*sx];
}
// Load 8 elements from global memory with stride sx.
inline void load8( float2 *a, global float2 *x, int sx )
{
    for(int i = 0; i < 8; i++, x += sx)
        a[i] = *x;
}
// Load 16 elements from global memory with stride sx.
inline void load16( float2 *a, global float2 *x, int sx )
{
    for(int i = 0; i < 16; i++, x += sx)
        a[i] = *x;
}
#endif
#ifdef UNROLL_LOOPS
// Read 8 real components back from local memory (stride sx, in order).
inline void loadx8i( float2 *a, local float *x, int sx )
{
    for( int i = 0; i < 8; i++ )
        a[i].x = x[i*sx];
}
// Read 16 real components back from local memory (stride sx, in order).
inline void loadx16i( float2 *a, local float *x, int sx )
{
    for( int i = 0; i < 16; i++ )
        a[i].x = x[i*sx];
}
#else
// Read 8 real components back from local memory (stride sx, in order).
inline void loadx8i( float2 *a, local float *x, int sx )
{
    for( int i = 0; i < 8; i++, x += sx )
        a[i].x = *x;
}
// Read 16 real components back from local memory (stride sx, in order).
inline void loadx16i( float2 *a, local float *x, int sx )
{
    for( int i = 0; i < 16; i++, x += sx )
        a[i].x = *x;
}
#endif
#ifdef UNROLL_LOOPS
// Read 8 imaginary components back from local memory (stride sx, in order).
inline void loady8i( float2 *a, local float *x, int sx )
{
    for( int i = 0; i < 8; i++ )
        a[i].y = x[i*sx];
}
// Read 16 imaginary components back from local memory (stride sx, in order).
inline void loady16i( float2 *a, local float *x, int sx )
{
    for( int i = 0; i < 16; i++ )
        a[i].y = x[i*sx];
}
#else
// Read 8 imaginary components back from local memory (stride sx, in order).
inline void loady8i( float2 *a, local float *x, int sx )
{
    for( int i = 0; i < 8; i++, x += sx )
        a[i].y = *x;
}
// Read 16 imaginary components back from local memory (stride sx, in order).
inline void loady16i( float2 *a, local float *x, int sx )
{
    for( int i = 0; i < 16; i++, x += sx )
        a[i].y = *x;
}
#endif
#ifdef UNROLL_LOOPS
// Gather 16 real components from local memory at the caller-supplied
// permuted offsets ind[].
// Fixed: the unrolled version ignored ind[] and hard-coded a copy of
// transpose16's il[] table (0-3, 16-19, 32-35, 48-51); behavior is
// unchanged for that caller, but the function now honors its argument.
inline void loadx16p( float2 *a, local float *x, int *ind )
{
    for( int i = 0; i < 16; i++ )
        a[i].x = x[ind[i]];
}
#else
// Gather 16 real components from local memory at permuted offsets ind[].
inline void loadx16p( float2 *a, local float *x, int *ind )
{
    for( int i = 0; i < 16; i++, ind++ )
        a[i].x = x[*ind];
}
#endif
#ifdef UNROLL_LOOPS
// Gather 16 imaginary components from local memory at the caller-supplied
// permuted offsets ind[].
// Fixed: the unrolled version ignored ind[] and hard-coded a copy of
// transpose16's il[] table (0-3, 16-19, 32-35, 48-51); behavior is
// unchanged for that caller, but the function now honors its argument.
inline void loady16p( float2 *a, local float *x, int *ind )
{
    for( int i = 0; i < 16; i++ )
        a[i].y = x[ind[i]];
}
#else
// Gather 16 imaginary components from local memory at permuted offsets ind[].
inline void loady16p( float2 *a, local float *x, int *ind )
{
    for( int i = 0; i < 16; i++, ind++ )
        a[i].y = x[*ind];
}
#endif
// Workgroup-wide transpose of 8 per-thread values through local memory.
// Real and imaginary components are staged separately (halves local-memory
// footprint); input is in bit-reversed order, output is linear. The
// barriers separate each store phase from the following load phase.
inline void transpose8( float2 *a, local float *s, int ds, local float *l, int dl)
{
storex8( a, s, ds ); barrier(CLK_LOCAL_MEM_FENCE);
loadx8i( a, l, dl ); barrier(CLK_LOCAL_MEM_FENCE);
storey8( a, s, ds ); barrier(CLK_LOCAL_MEM_FENCE);
loady8i( a, l, dl ); barrier(CLK_LOCAL_MEM_FENCE);
}
// Workgroup-wide transpose of 16 per-thread values through local memory.
// Stores use stride ds from base s; loads gather 4 consecutive floats from
// each of 4 rows (the il[] offsets) starting at base l. Components staged
// separately; barriers fence each store/load phase.
inline void transpose16( float2 *a, local float *s, int ds, local float *l)
{
int il[] = {0,1,2,3, 16,17,18,19, 32,33,34,35, 48,49,50,51};
storex16( a, s, ds ); barrier(CLK_LOCAL_MEM_FENCE);
loadx16p( a, l, il ); barrier(CLK_LOCAL_MEM_FENCE);
storey16( a, s, ds ); barrier(CLK_LOCAL_MEM_FENCE);
loady16p( a, l, il ); barrier(CLK_LOCAL_MEM_FENCE);
}
// Workgroup-wide transpose for the 4x4 stage, staged through local memory.
// NOTE(review): unlike transpose8/transpose16 there is no barrier after the
// final loady16i — safe only if callers do not touch this local buffer again
// before the next barrier; confirm against the kernels' usage.
inline void transpose4x4( float2 *a, local float *s, int ds, local float *l, int dl)
{
storex4x4( a, s, ds); barrier(CLK_LOCAL_MEM_FENCE);
loadx16i ( a, l, dl); barrier(CLK_LOCAL_MEM_FENCE);
storey4x4( a, s, ds); barrier(CLK_LOCAL_MEM_FENCE);
loady16i ( a, l, dl);
}
//
// multiply by twiddle factors in bit-reversed order
//
// Return e^(i*phi) as (cos phi, sin phi).
// Fixed: make_float2() is CUDA, not OpenCL C — use a vector literal.
inline float2 exp_i( float phi )
{
    return (float2)( cos(phi), sin(phi) );
}
// Complex multiply against a (precomputed) factor value.
// Fixed: make_float2() is CUDA, not OpenCL C; also, an address-space
// qualifier (`global`) is not valid on a by-value parameter — callers pass
// a float2 value, so the qualifier is simply dropped.
inline float2 complex_mult_const( float2 a, float2 b )
{
    return (float2)( a.x*b.x-a.y*b.y, a.x*b.y+a.y*b.x );
}
#ifdef UNROLL_LOOPS
#ifdef PRECOMPUTE_FACTORS
// Multiply 4 registers by precomputed twiddle factors, element j by
// factors[j] (factors are stored in the kernel's bit-reversed order).
inline void twiddle4(float2 *a, global float2 *factors)
{
    for( int j = 0; j < 4; j++ )
        a[j] = complex_mult( a[j], factors[j] );
}
// Multiply 8 registers by precomputed twiddle factors, element j by
// factors[j] (factors are stored in the kernel's bit-reversed order).
inline void twiddle8(float2 *a, global float2 *factors)
{
    for( int j = 0; j < 8; j++ )
        a[j] = complex_mult( a[j], factors[j] );
}
// Multiply 16 registers by precomputed twiddle factors, element j by
// factors[j] (factors are stored in the kernel's bit-reversed order).
// Fixed: signature now matches both the only call site in fft1D_1024
// (which passes three arguments) and the non-unrolled variant; n is
// unused here but kept so the two variants stay interchangeable.
inline void twiddle16(float2 *a, global float2 *factors, int n)
{
    for( int j = 0; j < 16; j++ )
        a[j] = complex_mult( a[j], factors[j] );
}
#else // PRECOMPUTE_FACTORS
// Multiply 4 registers (in bit-reversed order) by exp(-2*pi*i*rev[j]*i/n).
inline void twiddle4(float2 *a, int i, int n )
{
    const int rev[4] = {0, 2, 1, 3};
    for( int j = 0; j < 4; j++ )
        a[j] = complex_mult( a[j], exp_i((-2. * M_PI * rev[j] / n) * i));
}
// Multiply 8 registers (in bit-reversed order) by exp(-2*pi*i*rev[j]*i/n).
inline void twiddle8(float2 *a, int i, int n )
{
    const int rev[8] = {0, 4, 2, 6, 1, 5, 3, 7};
    for( int j = 0; j < 8; j++ )
        a[j] = complex_mult( a[j], exp_i((-2. * M_PI * rev[j] / n) * i));
}
// Multiply 16 registers (in digit-reversed order) by exp(-2*pi*i*rev[j]*i/n).
inline void twiddle16(float2 *a, int i, int n )
{
    const int rev[16] = { 0, 8, 4,12, 2,10, 6,14,
                          1, 9, 5,13, 3,11, 7,15 };
    for( int j = 0; j < 16; j++ )
        a[j] = complex_mult( a[j], exp_i((-2. * M_PI * rev[j] / n) * i));
}
#endif // PRECOMPUTE_FACTORS
#else // UNROLL_LOOPS
#ifdef PRECOMPUTE_FACTORS
// Multiply 16 registers by precomputed factors, element j by factors[j].
// Fixed: every element was multiplied by factors[11] instead of factors[j].
inline void twiddle16(float2 *a, global float2 *factors, int n )
{
    for( int j = 0; j < 16; j++ )
        a[j] = complex_mult( a[j], factors[j]);
}
#else // PRECOMPUTE_FACTORS
// Multiply registers 1..3 (element 0's factor is 1) by the computed
// bit-reversed twiddles.
// Fixed: rev[] was declared with 8 elements but only 4 initializers.
inline void twiddle4( float2 *a, int i, int n )
{
    int rev[4] = {0,2,1,3};
    for( int j = 1; j < 4; j++ )
        a[j] = complex_mult(a[j], exp_i((-2 * M_PI * rev[j] / n) * i));
}
// Multiply registers 1..7 (element 0's factor is 1) by the computed
// bit-reversed twiddles exp(-2*pi*i*rev[k]*i/n).
inline void twiddle8( float2 *a, int i, int n )
{
    const int rev[8] = {0,4,2,6,1,5,3,7};
    for( int k = 1; k < 8; k++ ) {
        float2 w = exp_i((-2 * M_PI * rev[k] / n) * i);
        a[k] = complex_mult(a[k], w);
    }
}
// Multiply registers 1..15 (element 0's factor is 1) by the computed
// digit-reversed twiddles exp(-2*pi*i*rev[k]*i/n).
inline void twiddle16( float2 *a, int i, int n )
{
    const int rev[16] = { 0, 8, 4,12, 2,10, 6,14, 1, 9, 5,13, 3,11, 7,15};
    for( int k = 1; k < 16; k++ ) {
        float2 w = exp_i((-2 * M_PI * rev[k] / n) * i);
        a[k] = complex_mult(a[k], w);
    }
}
#endif // PRECOMPUTE_FACTORS
#endif // UNROLL_LOOPS
#ifdef PRECOMPUTE_FACTORS
// 4x4-stage twiddles from the precomputed table: column c (1..3) of every
// row is multiplied by factors[c-1]; column 0's factor is 1.
inline void twiddle4x4(float2 *a, global float2 *factors)
{
    for( int c = 1; c < 4; c++ ) {
        float2 w = factors[c-1];
        for( int r = 0; r < 16; r += 4 )
            a[r + c] = complex_mult(a[r + c], w);
    }
}
#else // PRECOMPUTE_FACTORS
// 4x4-stage twiddles, computed on the fly: column c+1 of every row is
// multiplied by exp(coef[c] * (pi/32) * i); column 0's factor is 1.
inline void twiddle4x4(float2 *a, int i )
{
    const float coef[3] = { -2.f, -1.f, -3.f };
    for( int c = 0; c < 3; c++ ) {
        float2 w = exp_i((coef[c] * M_PI/32.) * i);
        for( int r = 0; r < 16; r += 4 )
            a[r + c + 1] = complex_mult(a[r + c + 1], w);
    }
}
#endif // PRECOMPUTE_FACTORS
// 1024 element 1D FFT
// 1024-point FFT, Volkov-style: radix-16, then a 4x4 stage, then radix-16,
// with transposes staged through padded local memory. One 1024-sample
// transform per 64-thread workgroup; 16 elements per work-item. Output is
// normalized by 1/32 and written in natural order by store16.
kernel __attribute__((reqd_work_group_size(64, 1, 1)))
#ifdef PRECOMPUTE_FACTORS
void fft1D_1024 (global float2 *src,
global float2 *dst,
global float2 *twiddleFactors)
{
#else
void fft1D_1024 (global float2 *src,
global float2 *dst)
{
#endif
int tid = get_local_id(0);
int iblock = get_group_id(0);
int index = iblock * 1024 + tid; // base of this workgroup's transform
src += index;
dst += index;
int hi4 = tid>>4; // upper 2 bits of tid (0..3)
int lo4 = tid&15; // lower 4 bits of tid (0..15)
int hi2 = tid>>4; // same value as hi4; kept separate for the 4x4 addressing
int mi2 = (tid>>2)&3;
int lo2 = tid&3;
float2 a[16];
// 69 floats per 16-element row; presumably padded to dodge local-memory
// bank conflicts — TODO confirm padding rationale against target hardware.
local float smem[69*16];
load16(a, src, 64); // element j read from src[j*64]
FFT16 ( a );
#ifdef PRECOMPUTE_FACTORS
// NOTE(review): the unrolled precomputed twiddle16() takes (a, factors);
// this 3-argument call matches only the non-unrolled variant, so the build
// breaks with UNROLL_LOOPS and PRECOMPUTE_FACTORS both defined — verify.
twiddle16(a, &twiddleFactors[16*tid], 1024);
#else // PRECOMPUTE_FACTORS
twiddle16(a, tid, 1024);
#endif// PRECOMPUTE_FACTORS
transpose16( a, &smem[lo4*65+hi4 ], 4, &smem[lo4*65+hi4*4] );
FFT4x4( a );
#ifdef PRECOMPUTE_FACTORS
// 3 factors per row, stored after the 1024 radix-16 factors (see host code).
twiddle4x4( a, &twiddleFactors[1024 + 3*lo4] );
#else // PRECOMPUTE_FACTORS
twiddle4x4( a, lo4 );
#endif// PRECOMPUTE_FACTORS
transpose4x4( a, &smem[hi2*17 + mi2*4 + lo2], 69,
&smem[mi2*69*4 + hi2*69 + lo2*17], 1);
FFT16 ( a );
store16(a, dst); // digit-reversal undone and 1/32 normalization applied here
}
// 512 element 1D FFT
// 512-point FFT, Volkov-style: three radix-8 passes with local-memory
// transposes. One 512-sample transform per 64-thread workgroup; 8 elements
// per work-item. Output is normalized by 1/sqrt(512) in store8.
// (Author's note: this is part of the not-yet-working 2k path.)
kernel __attribute__((reqd_work_group_size(64, 1, 1)))
#ifdef PRECOMPUTE_FACTORS
void fft1D_512 (global float2 *src,
global float2 *dst,
global float2 *twiddleFactors)
{
#else
void fft1D_512 (global float2 *src,
global float2 *dst)
{
#endif
int tid = get_local_id(0);
int iblock = get_group_id(0);
int index = iblock * 512 + tid; // base of this workgroup's transform
src += index;
dst += index;
int hi = tid >> 3; // High-order bits (3)
int lo = tid & 7; // Low-order bits (3)
float2 a[8];
local float smem[8*8*9];
load8(a, src, 64);
FFT8(a);
#ifdef PRECOMPUTE_FACTORS
// Fixed: was twiddle8(a, tid, twiddleFactors[8 * tid], 512) — wrong arity,
// and passed a float2 value where the factor pointer belongs.
twiddle8(a, &twiddleFactors[8 * tid]);
#else // PRECOMPUTE_FACTORS
twiddle8(a, tid, 512);
#endif// PRECOMPUTE_FACTORS
transpose8(a, &smem[hi*8+lo], 66, &smem[lo*66+hi], 8);
FFT8(a);
#ifdef PRECOMPUTE_FACTORS
// Fixed arity to match twiddle8(a, factors). NOTE(review): the host code
// only generates the 1024-FFT tables, so no factors exist for this stage
// (8 per hi, n=64); the offset below is a placeholder — still probably wrong.
twiddle8(a, &twiddleFactors[512 + 8 * hi]);
#else // PRECOMPUTE_FACTORS
twiddle8(a, hi, 64);
#endif// PRECOMPUTE_FACTORS
transpose8(a, &smem[hi*8+lo], 8*9, &smem[hi*8*9+lo], 8);
FFT8(a);
store8( a, dst);
}
// 4 element 1D FFT
// Final radix-4 pass of the 2048-point FFT (2048 = 512 x 4): eight
// 64-thread groups per transform, one 4-point FFT per work-item over a
// stride-512 column.
kernel __attribute__((reqd_work_group_size(64, 1, 1)))
void fft1D_4 (global float2 *src,
global float2 *dst)
{
int tid = get_local_id(0);
int bid = get_group_id(0);
int lo = bid & (2048/4/64-1); // group index within this transform (0..7)
int hi = bid &~(2048/4/64-1); // transform index * 8
int i = lo*64 + tid;          // row within the transform (0..511)
// Fixed: src/dst were never offset, so every work-item processed the same
// four elements. hi*256 == (bid/8)*2048 is this transform's base offset.
// NOTE(review): reconstructed from Volkov's 2048-point layout — verify.
src += hi*256 + i;
dst += hi*256 + i;
float2 a[4];
load4(a, src, 512);
FFT4(&a[0], &a[1], &a[2], &a[3]);
twiddle4(a, i, 2048);
store4(a, dst, 512);
}