Hello there,
I’m trying to use the OpenCL FFT library that has recently been released by apple, as I didn’t find anything else for big FFTs using OpenCL yet. The apple source automatically generates kernel code, but this code doesn’t compile.
It must have something to do with the define that is used there, fftKernel16, but I don’t get where the error is, everything looks fine to me. The compiler complains with an error though…
The errors are:
:230: error: cannot codegen this l-value expression yet
fftKernel16(a, dir);
^~~~~~~~~~~
:230: error: cannot codegen this l-value expression yet
fftKernel16(a, dir);
^~~~~~~~~~~
:230: error: cannot codegen this l-value expression yet
fftKernel16(a, dir);
^~~~~~~~~~~
.....................................................
The error appears 32 times.
This is the automatically generated source code by the apple FFT library. It’s based on an FFT plan for a 8192 size 1D fft, search for “THIS IS WHAT CAUSES THE ERROR ********************************” to find the error location.
Any help?
Thanks,
Nils
#ifndef M_PI
#define M_PI 0x1.921fb54442d18p+1
#endif
#define complexMul(a,b) ((float2)(mad(-(a).y, (b).y, (a).x * (b).x), mad((a).y, (b).x, (a).x * (b).y)))
#define conj(a) ((float2)((a).x, -(a).y))
#define conjTransp(a) ((float2)(-(a).y, (a).x))
#define fftKernel2(a,dir) \
{ \
float2 c = (a)[0]; \
(a)[0] = c + (a)[1]; \
(a)[1] = c - (a)[1]; \
}
#define fftKernel2S(d1,d2,dir) \
{ \
float2 c = (d1); \
(d1) = c + (d2); \
(d2) = c - (d2); \
}
#define fftKernel4(a,dir) \
{ \
fftKernel2S((a)[0], (a)[2], dir); \
fftKernel2S((a)[1], (a)[3], dir); \
fftKernel2S((a)[0], (a)[1], dir); \
(a)[3] = (float2)(dir)*(conjTransp((a)[3])); \
fftKernel2S((a)[2], (a)[3], dir); \
float2 c = (a)[1]; \
(a)[1] = (a)[2]; \
(a)[2] = c; \
}
#define fftKernel4s(a0,a1,a2,a3,dir) \
{ \
fftKernel2S((a0), (a2), dir); \
fftKernel2S((a1), (a3), dir); \
fftKernel2S((a0), (a1), dir); \
(a3) = (float2)(dir)*(conjTransp((a3))); \
fftKernel2S((a2), (a3), dir); \
float2 c = (a1); \
(a1) = (a2); \
(a2) = c; \
}
#define bitreverse8(a) \
{ \
float2 c; \
c = (a)[1]; \
(a)[1] = (a)[4]; \
(a)[4] = c; \
c = (a)[3]; \
(a)[3] = (a)[6]; \
(a)[6] = c; \
}
#define fftKernel8(a,dir) \
{ \
const float2 w1 = (float2)(0x1.6a09e6p-1f, dir*0x1.6a09e6p-1f); \
const float2 w3 = (float2)(-0x1.6a09e6p-1f, dir*0x1.6a09e6p-1f); \
float2 c; \
fftKernel2S((a)[0], (a)[4], dir); \
fftKernel2S((a)[1], (a)[5], dir); \
fftKernel2S((a)[2], (a)[6], dir); \
fftKernel2S((a)[3], (a)[7], dir); \
(a)[5] = complexMul(w1, (a)[5]); \
(a)[6] = (float2)(dir)*(conjTransp((a)[6])); \
(a)[7] = complexMul(w3, (a)[7]); \
fftKernel2S((a)[0], (a)[2], dir); \
fftKernel2S((a)[1], (a)[3], dir); \
fftKernel2S((a)[4], (a)[6], dir); \
fftKernel2S((a)[5], (a)[7], dir); \
(a)[3] = (float2)(dir)*(conjTransp((a)[3])); \
(a)[7] = (float2)(dir)*(conjTransp((a)[7])); \
fftKernel2S((a)[0], (a)[1], dir); \
fftKernel2S((a)[2], (a)[3], dir); \
fftKernel2S((a)[4], (a)[5], dir); \
fftKernel2S((a)[6], (a)[7], dir); \
bitreverse8((a)); \
}
#define bitreverse4x4(a) \
{ \
float2 c; \
c = (a)[1]; (a)[1] = (a)[4]; (a)[4] = c; \
c = (a)[2]; (a)[2] = (a)[8]; (a)[8] = c; \
c = (a)[3]; (a)[3] = (a)[12]; (a)[12] = c; \
c = (a)[6]; (a)[6] = (a)[9]; (a)[9] = c; \
c = (a)[7]; (a)[7] = (a)[13]; (a)[13] = c; \
c = (a)[11]; (a)[11] = (a)[14]; (a)[14] = c; \
}
#define fftKernel16(a,dir) \
{ \
const float w0 = 0x1.d906bcp-1f; \
const float w1 = 0x1.87de2ap-2f; \
const float w2 = 0x1.6a09e6p-1f; \
fftKernel4s((a)[0], (a)[4], (a)[8], (a)[12], dir); \
fftKernel4s((a)[1], (a)[5], (a)[9], (a)[13], dir); \
fftKernel4s((a)[2], (a)[6], (a)[10], (a)[14], dir); \
fftKernel4s((a)[3], (a)[7], (a)[11], (a)[15], dir); \
(a)[5] = complexMul((a)[5], (float2)(w0, dir*w1)); \
(a)[6] = complexMul((a)[6], (float2)(w2, dir*w2)); \
(a)[7] = complexMul((a)[7], (float2)(w1, dir*w0)); \
(a)[9] = complexMul((a)[9], (float2)(w2, dir*w2)); \
(a)[10] = (float2)(dir)*(conjTransp((a)[10])); \
(a)[11] = complexMul((a)[11], (float2)(-w2, dir*w2)); \
(a)[13] = complexMul((a)[13], (float2)(w1, dir*w0)); \
(a)[14] = complexMul((a)[14], (float2)(-w2, dir*w2)); \
(a)[15] = complexMul((a)[15], (float2)(-w0, dir*-w1)); \
fftKernel4((a), dir); \
fftKernel4((a) + 4, dir); \
fftKernel4((a) + 8, dir); \
fftKernel4((a) + 12, dir); \
bitreverse4x4((a)); \
}
#define bitreverse32(a) \
{ \
float2 c1, c2; \
c1 = (a)[2]; (a)[2] = (a)[1]; c2 = (a)[4]; (a)[4] = c1; c1 = (a)[8]; (a)[8] = c2; c2 = (a)[16]; (a)[16] = c1; (a)[1] = c2; \
c1 = (a)[6]; (a)[6] = (a)[3]; c2 = (a)[12]; (a)[12] = c1; c1 = (a)[24]; (a)[24] = c2; c2 = (a)[17]; (a)[17] = c1; (a)[3] = c2; \
c1 = (a)[10]; (a)[10] = (a)[5]; c2 = (a)[20]; (a)[20] = c1; c1 = (a)[9]; (a)[9] = c2; c2 = (a)[18]; (a)[18] = c1; (a)[5] = c2; \
c1 = (a)[14]; (a)[14] = (a)[7]; c2 = (a)[28]; (a)[28] = c1; c1 = (a)[25]; (a)[25] = c2; c2 = (a)[19]; (a)[19] = c1; (a)[7] = c2; \
c1 = (a)[22]; (a)[22] = (a)[11]; c2 = (a)[13]; (a)[13] = c1; c1 = (a)[26]; (a)[26] = c2; c2 = (a)[21]; (a)[21] = c1; (a)[11] = c2; \
c1 = (a)[30]; (a)[30] = (a)[15]; c2 = (a)[29]; (a)[29] = c1; c1 = (a)[27]; (a)[27] = c2; c2 = (a)[23]; (a)[23] = c1; (a)[15] = c2; \
}
#define fftKernel32(a,dir) \
{ \
fftKernel2S((a)[0], (a)[16], dir); \
fftKernel2S((a)[1], (a)[17], dir); \
fftKernel2S((a)[2], (a)[18], dir); \
fftKernel2S((a)[3], (a)[19], dir); \
fftKernel2S((a)[4], (a)[20], dir); \
fftKernel2S((a)[5], (a)[21], dir); \
fftKernel2S((a)[6], (a)[22], dir); \
fftKernel2S((a)[7], (a)[23], dir); \
fftKernel2S((a)[8], (a)[24], dir); \
fftKernel2S((a)[9], (a)[25], dir); \
fftKernel2S((a)[10], (a)[26], dir); \
fftKernel2S((a)[11], (a)[27], dir); \
fftKernel2S((a)[12], (a)[28], dir); \
fftKernel2S((a)[13], (a)[29], dir); \
fftKernel2S((a)[14], (a)[30], dir); \
fftKernel2S((a)[15], (a)[31], dir); \
(a)[17] = complexMul((a)[17], (float2)(0x1.f6297cp-1f, dir*0x1.8f8b84p-3f)); \
(a)[18] = complexMul((a)[18], (float2)(0x1.d906bcp-1f, dir*0x1.87de2ap-2f)); \
(a)[19] = complexMul((a)[19], (float2)(0x1.a9b662p-1f, dir*0x1.1c73b4p-1f)); \
(a)[20] = complexMul((a)[20], (float2)(0x1.6a09e6p-1f, dir*0x1.6a09e6p-1f)); \
(a)[21] = complexMul((a)[21], (float2)(0x1.1c73b4p-1f, dir*0x1.a9b662p-1f)); \
(a)[22] = complexMul((a)[22], (float2)(0x1.87de2ap-2f, dir*0x1.d906bcp-1f)); \
(a)[23] = complexMul((a)[23], (float2)(0x1.8f8b84p-3f, dir*0x1.f6297cp-1f)); \
(a)[24] = complexMul((a)[24], (float2)(0x0p+0f, dir*0x1p+0f)); \
(a)[25] = complexMul((a)[25], (float2)(-0x1.8f8b84p-3f, dir*0x1.f6297cp-1f)); \
(a)[26] = complexMul((a)[26], (float2)(-0x1.87de2ap-2f, dir*0x1.d906bcp-1f)); \
(a)[27] = complexMul((a)[27], (float2)(-0x1.1c73b4p-1f, dir*0x1.a9b662p-1f)); \
(a)[28] = complexMul((a)[28], (float2)(-0x1.6a09e6p-1f, dir*0x1.6a09e6p-1f)); \
(a)[29] = complexMul((a)[29], (float2)(-0x1.a9b662p-1f, dir*0x1.1c73b4p-1f)); \
(a)[30] = complexMul((a)[30], (float2)(-0x1.d906bcp-1f, dir*0x1.87de2ap-2f)); \
(a)[31] = complexMul((a)[31], (float2)(-0x1.f6297cp-1f, dir*0x1.8f8b84p-3f)); \
fftKernel16((a), dir); \
fftKernel16((a) + 16, dir); \
bitreverse32((a)); \
}
__kernel void \
clFFT_1DTwistInterleaved(__global float2 *in, unsigned int startRow, unsigned int numCols, unsigned int N, unsigned int numRowsToProcess, int dir) \
{ \
float2 a, w; \
float ang; \
unsigned int j; \
unsigned int i = get_global_id(0); \
unsigned int startIndex = mad24(startRow, numCols, i); \
\
if(i < numCols) \
{ \
for(j = 0; j < numRowsToProcess; j++) \
{ \
a = in[startIndex]; \
ang = 2.0f * M_PI * dir * i * (startRow + j) / N; \
w = (float2)(native_cos(ang), native_sin(ang)); \
a = complexMul(a, w); \
in[startIndex] = a; \
startIndex += numCols; \
} \
} \
} \
__kernel void fft0(__global float2 *in, __global float2 *out, int dir, int S)
{
__local float sMem[2064];
int i, j, r, indexIn, indexOut, index, tid, bNum, xNum, k, l;
int s, ii, jj, offset;
float2 w;
float ang, angf, ang1;
__local float *lMemStore, *lMemLoad;
float2 a[16];
int lId = get_local_id( 0 );
int groupId = get_group_id( 0 );
bNum = groupId & 3;
xNum = groupId >> 2;
indexIn = mul24(bNum, 16);
tid = indexIn;
i = tid >> 0;
j = tid & 0;
indexOut = mad24(i, 128, j);
indexIn += (xNum << 13);
indexOut += (xNum << 13);
tid = lId;
i = tid & 15;
j = tid >> 4;
indexIn += mad24(j, 64, i);
in += indexIn;
a[0] = in[0];
a[1] = in[512];
a[2] = in[1024];
a[3] = in[1536];
a[4] = in[2048];
a[5] = in[2560];
a[6] = in[3072];
a[7] = in[3584];
a[8] = in[4096];
a[9] = in[4608];
a[10] = in[5120];
a[11] = in[5632];
a[12] = in[6144];
a[13] = in[6656];
a[14] = in[7168];
a[15] = in[7680];
fftKernel16(a, dir); // THIS IS WHAT CAUSES THE ERROR ************************************************
ang = dir*(2.0f*M_PI*1/128)*j;
w = (float2)(native_cos(ang), native_sin(ang));
a[1] = complexMul(a[1], w);
ang = dir*(2.0f*M_PI*2/128)*j;
w = (float2)(native_cos(ang), native_sin(ang));
a[2] = complexMul(a[2], w);
ang = dir*(2.0f*M_PI*3/128)*j;
w = (float2)(native_cos(ang), native_sin(ang));
a[3] = complexMul(a[3], w);
ang = dir*(2.0f*M_PI*4/128)*j;
w = (float2)(native_cos(ang), native_sin(ang));
a[4] = complexMul(a[4], w);
ang = dir*(2.0f*M_PI*5/128)*j;
w = (float2)(native_cos(ang), native_sin(ang));
a[5] = complexMul(a[5], w);
ang = dir*(2.0f*M_PI*6/128)*j;
w = (float2)(native_cos(ang), native_sin(ang));
a[6] = complexMul(a[6], w);
ang = dir*(2.0f*M_PI*7/128)*j;
w = (float2)(native_cos(ang), native_sin(ang));
a[7] = complexMul(a[7], w);
ang = dir*(2.0f*M_PI*8/128)*j;
w = (float2)(native_cos(ang), native_sin(ang));
a[8] = complexMul(a[8], w);
ang = dir*(2.0f*M_PI*9/128)*j;
w = (float2)(native_cos(ang), native_sin(ang));
a[9] = complexMul(a[9], w);
ang = dir*(2.0f*M_PI*10/128)*j;
w = (float2)(native_cos(ang), native_sin(ang));
a[10] = complexMul(a[10], w);
ang = dir*(2.0f*M_PI*11/128)*j;
w = (float2)(native_cos(ang), native_sin(ang));
a[11] = complexMul(a[11], w);
ang = dir*(2.0f*M_PI*12/128)*j;
w = (float2)(native_cos(ang), native_sin(ang));
a[12] = complexMul(a[12], w);
ang = dir*(2.0f*M_PI*13/128)*j;
w = (float2)(native_cos(ang), native_sin(ang));
a[13] = complexMul(a[13], w);
ang = dir*(2.0f*M_PI*14/128)*j;
w = (float2)(native_cos(ang), native_sin(ang));
a[14] = complexMul(a[14], w);
ang = dir*(2.0f*M_PI*15/128)*j;
w = (float2)(native_cos(ang), native_sin(ang));
a[15] = complexMul(a[15], w);
indexIn = mad24(j, 256, i);
lMemStore = sMem + tid;
lMemLoad = sMem + indexIn;
lMemStore[0] = a[0].x;
lMemStore[128] = a[1].x;
lMemStore[256] = a[2].x;
lMemStore[384] = a[3].x;
lMemStore[512] = a[4].x;
lMemStore[640] = a[5].x;
lMemStore[768] = a[6].x;
lMemStore[896] = a[7].x;
lMemStore[1024] = a[8].x;
lMemStore[1152] = a[9].x;
lMemStore[1280] = a[10].x;
lMemStore[1408] = a[11].x;
lMemStore[1536] = a[12].x;
lMemStore[1664] = a[13].x;
lMemStore[1792] = a[14].x;
lMemStore[1920] = a[15].x;
barrier(CLK_LOCAL_MEM_FENCE);
a[0].x = lMemLoad[0];
a[1].x = lMemLoad[16];
a[2].x = lMemLoad[32];
a[3].x = lMemLoad[48];
a[4].x = lMemLoad[64];
a[5].x = lMemLoad[80];
a[6].x = lMemLoad[96];
a[7].x = lMemLoad[112];
a[8].x = lMemLoad[128];
a[9].x = lMemLoad[144];
a[10].x = lMemLoad[160];
a[11].x = lMemLoad[176];
a[12].x = lMemLoad[192];
a[13].x = lMemLoad[208];
a[14].x = lMemLoad[224];
a[15].x = lMemLoad[240];
barrier(CLK_LOCAL_MEM_FENCE);
lMemStore[0] = a[0].y;
lMemStore[128] = a[1].y;
lMemStore[256] = a[2].y;
lMemStore[384] = a[3].y;
lMemStore[512] = a[4].y;
lMemStore[640] = a[5].y;
lMemStore[768] = a[6].y;
lMemStore[896] = a[7].y;
lMemStore[1024] = a[8].y;
lMemStore[1152] = a[9].y;
lMemStore[1280] = a[10].y;
lMemStore[1408] = a[11].y;
lMemStore[1536] = a[12].y;
lMemStore[1664] = a[13].y;
lMemStore[1792] = a[14].y;
lMemStore[1920] = a[15].y;
barrier(CLK_LOCAL_MEM_FENCE);
a[0].y = lMemLoad[0];
a[1].y = lMemLoad[16];
a[2].y = lMemLoad[32];
a[3].y = lMemLoad[48];
a[4].y = lMemLoad[64];
a[5].y = lMemLoad[80];
a[6].y = lMemLoad[96];
a[7].y = lMemLoad[112];
a[8].y = lMemLoad[128];
a[9].y = lMemLoad[144];
a[10].y = lMemLoad[160];
a[11].y = lMemLoad[176];
a[12].y = lMemLoad[192];
a[13].y = lMemLoad[208];
a[14].y = lMemLoad[224];
a[15].y = lMemLoad[240];
barrier(CLK_LOCAL_MEM_FENCE);
fftKernel8(a + 0, dir);
fftKernel8(a + 8, dir);
l = ((bNum << 4) + i) >> 0;
k = j << 1;
ang1 = dir*(2.0f*M_PI/8192)*l;
ang = ang1*(k + 0);
w = (float2)(native_cos(ang), native_sin(ang));
a[0] = complexMul(a[0], w);
ang = ang1*(k + 16);
w = (float2)(native_cos(ang), native_sin(ang));
a[1] = complexMul(a[1], w);
ang = ang1*(k + 32);
w = (float2)(native_cos(ang), native_sin(ang));
a[2] = complexMul(a[2], w);
ang = ang1*(k + 48);
w = (float2)(native_cos(ang), native_sin(ang));
a[3] = complexMul(a[3], w);
ang = ang1*(k + 64);
w = (float2)(native_cos(ang), native_sin(ang));
a[4] = complexMul(a[4], w);
ang = ang1*(k + 80);
w = (float2)(native_cos(ang), native_sin(ang));
a[5] = complexMul(a[5], w);
ang = ang1*(k + 96);
w = (float2)(native_cos(ang), native_sin(ang));
a[6] = complexMul(a[6], w);
ang = ang1*(k + 112);
w = (float2)(native_cos(ang), native_sin(ang));
a[7] = complexMul(a[7], w);
ang = ang1*(k + 1);
w = (float2)(native_cos(ang), native_sin(ang));
a[8] = complexMul(a[8], w);
ang = ang1*(k + 17);
w = (float2)(native_cos(ang), native_sin(ang));
a[9] = complexMul(a[9], w);
ang = ang1*(k + 33);
w = (float2)(native_cos(ang), native_sin(ang));
a[10] = complexMul(a[10], w);
ang = ang1*(k + 49);
w = (float2)(native_cos(ang), native_sin(ang));
a[11] = complexMul(a[11], w);
ang = ang1*(k + 65);
w = (float2)(native_cos(ang), native_sin(ang));
a[12] = complexMul(a[12], w);
ang = ang1*(k + 81);
w = (float2)(native_cos(ang), native_sin(ang));
a[13] = complexMul(a[13], w);
ang = ang1*(k + 97);
w = (float2)(native_cos(ang), native_sin(ang));
a[14] = complexMul(a[14], w);
ang = ang1*(k + 113);
w = (float2)(native_cos(ang), native_sin(ang));
a[15] = complexMul(a[15], w);
lMemStore = sMem + mad24(i, 129, j << 1);
lMemLoad = sMem + mad24(tid >> 7, 129, tid & 127);
lMemStore[ 0] = a[0].x;
lMemStore[ 16] = a[1].x;
lMemStore[ 32] = a[2].x;
lMemStore[ 48] = a[3].x;
lMemStore[ 64] = a[4].x;
lMemStore[ 80] = a[5].x;
lMemStore[ 96] = a[6].x;
lMemStore[ 112] = a[7].x;
lMemStore[ 1] = a[8].x;
lMemStore[ 17] = a[9].x;
lMemStore[ 33] = a[10].x;
lMemStore[ 49] = a[11].x;
lMemStore[ 65] = a[12].x;
lMemStore[ 81] = a[13].x;
lMemStore[ 97] = a[14].x;
lMemStore[ 113] = a[15].x;
barrier(CLK_LOCAL_MEM_FENCE);
a[0].x = lMemLoad[0];
a[1].x = lMemLoad[129];
a[2].x = lMemLoad[258];
a[3].x = lMemLoad[387];
a[4].x = lMemLoad[516];
a[5].x = lMemLoad[645];
a[6].x = lMemLoad[774];
a[7].x = lMemLoad[903];
a[8].x = lMemLoad[1032];
a[9].x = lMemLoad[1161];
a[10].x = lMemLoad[1290];
a[11].x = lMemLoad[1419];
a[12].x = lMemLoad[1548];
a[13].x = lMemLoad[1677];
a[14].x = lMemLoad[1806];
a[15].x = lMemLoad[1935];
barrier(CLK_LOCAL_MEM_FENCE);
lMemStore[ 0] = a[0].y;
lMemStore[ 16] = a[1].y;
lMemStore[ 32] = a[2].y;
lMemStore[ 48] = a[3].y;
lMemStore[ 64] = a[4].y;
lMemStore[ 80] = a[5].y;
lMemStore[ 96] = a[6].y;
lMemStore[ 112] = a[7].y;
lMemStore[ 1] = a[8].y;
lMemStore[ 17] = a[9].y;
lMemStore[ 33] = a[10].y;
lMemStore[ 49] = a[11].y;
lMemStore[ 65] = a[12].y;
lMemStore[ 81] = a[13].y;
lMemStore[ 97] = a[14].y;
lMemStore[ 113] = a[15].y;
barrier(CLK_LOCAL_MEM_FENCE);
a[0].y = lMemLoad[0];
a[1].y = lMemLoad[129];
a[2].y = lMemLoad[258];
a[3].y = lMemLoad[387];
a[4].y = lMemLoad[516];
a[5].y = lMemLoad[645];
a[6].y = lMemLoad[774];
a[7].y = lMemLoad[903];
a[8].y = lMemLoad[1032];
a[9].y = lMemLoad[1161];
a[10].y = lMemLoad[1290];
a[11].y = lMemLoad[1419];
a[12].y = lMemLoad[1548];
a[13].y = lMemLoad[1677];
a[14].y = lMemLoad[1806];
a[15].y = lMemLoad[1935];
barrier(CLK_LOCAL_MEM_FENCE);
indexOut += tid;
out += indexOut;
out[0] = a[0];
out[128] = a[1];
out[256] = a[2];
out[384] = a[3];
out[512] = a[4];
out[640] = a[5];
out[768] = a[6];
out[896] = a[7];
out[1024] = a[8];
out[1152] = a[9];
out[1280] = a[10];
out[1408] = a[11];
out[1536] = a[12];
out[1664] = a[13];
out[1792] = a[14];
out[1920] = a[15];
}
__kernel void fft1(__global float2 *in, __global float2 *out, int dir, int S)
{
__local float sMem[1024];
int i, j, r, indexIn, indexOut, index, tid, bNum, xNum, k, l;
int s, ii, jj, offset;
float2 w;
float ang, angf, ang1;
__local float *lMemStore, *lMemLoad;
float2 a[8];
int lId = get_local_id( 0 );
int groupId = get_group_id( 0 );
bNum = groupId & 7;
xNum = groupId >> 3;
indexIn = mul24(bNum, 16);
tid = indexIn;
i = tid >> 7;
j = tid & 127;
indexOut = mad24(i, 8192, j);
indexIn += (xNum << 13);
indexOut += (xNum << 13);
tid = lId;
i = tid & 15;
j = tid >> 4;
indexIn += mad24(j, 128, i);
in += indexIn;
a[0] = in[0];
a[1] = in[1024];
a[2] = in[2048];
a[3] = in[3072];
a[4] = in[4096];
a[5] = in[5120];
a[6] = in[6144];
a[7] = in[7168];
fftKernel8(a, dir);
ang = dir*(2.0f*M_PI*1/64)*j;
w = (float2)(native_cos(ang), native_sin(ang));
a[1] = complexMul(a[1], w);
ang = dir*(2.0f*M_PI*2/64)*j;
w = (float2)(native_cos(ang), native_sin(ang));
a[2] = complexMul(a[2], w);
ang = dir*(2.0f*M_PI*3/64)*j;
w = (float2)(native_cos(ang), native_sin(ang));
a[3] = complexMul(a[3], w);
ang = dir*(2.0f*M_PI*4/64)*j;
w = (float2)(native_cos(ang), native_sin(ang));
a[4] = complexMul(a[4], w);
ang = dir*(2.0f*M_PI*5/64)*j;
w = (float2)(native_cos(ang), native_sin(ang));
a[5] = complexMul(a[5], w);
ang = dir*(2.0f*M_PI*6/64)*j;
w = (float2)(native_cos(ang), native_sin(ang));
a[6] = complexMul(a[6], w);
ang = dir*(2.0f*M_PI*7/64)*j;
w = (float2)(native_cos(ang), native_sin(ang));
a[7] = complexMul(a[7], w);
indexIn = mad24(j, 128, i);
lMemStore = sMem + tid;
lMemLoad = sMem + indexIn;
lMemStore[0] = a[0].x;
lMemStore[128] = a[1].x;
lMemStore[256] = a[2].x;
lMemStore[384] = a[3].x;
lMemStore[512] = a[4].x;
lMemStore[640] = a[5].x;
lMemStore[768] = a[6].x;
lMemStore[896] = a[7].x;
barrier(CLK_LOCAL_MEM_FENCE);
a[0].x = lMemLoad[0];
a[1].x = lMemLoad[16];
a[2].x = lMemLoad[32];
a[3].x = lMemLoad[48];
a[4].x = lMemLoad[64];
a[5].x = lMemLoad[80];
a[6].x = lMemLoad[96];
a[7].x = lMemLoad[112];
barrier(CLK_LOCAL_MEM_FENCE);
lMemStore[0] = a[0].y;
lMemStore[128] = a[1].y;
lMemStore[256] = a[2].y;
lMemStore[384] = a[3].y;
lMemStore[512] = a[4].y;
lMemStore[640] = a[5].y;
lMemStore[768] = a[6].y;
lMemStore[896] = a[7].y;
barrier(CLK_LOCAL_MEM_FENCE);
a[0].y = lMemLoad[0];
a[1].y = lMemLoad[16];
a[2].y = lMemLoad[32];
a[3].y = lMemLoad[48];
a[4].y = lMemLoad[64];
a[5].y = lMemLoad[80];
a[6].y = lMemLoad[96];
a[7].y = lMemLoad[112];
barrier(CLK_LOCAL_MEM_FENCE);
fftKernel8(a + 0, dir);
indexOut += mad24(j, 128, i);
out += indexOut;
out[0] = a[0];
out[1024] = a[1];
out[2048] = a[2];
out[3072] = a[3];
out[4096] = a[4];
out[5120] = a[5];
out[6144] = a[6];
out[7168] = a[7];
}