FFT code doesn't compile, why? Geforce 8600 GT / Driver 195.62 / Win7 x64

Hello there,

I’m trying to use the OpenCL FFT library that has recently been released by apple, as I didn’t find anything else for big FFTs using OpenCL yet. The apple source automatically generates kernel code, but this code doesn’t compile.

It must have something to do with the define that is used there, fftKernel16, but I don’t get where the error is, everything looks fine to me. The compiler complains with an error though…

The errors are:

:230: error: cannot codegen this l-value expression yet

fftKernel16(a, dir);

^~~~~~~~~~~

:230: error: cannot codegen this l-value expression yet

fftKernel16(a, dir);

^~~~~~~~~~~

:230: error: cannot codegen this l-value expression yet

fftKernel16(a, dir);

^~~~~~~~~~~

.....................................................

The error appears 32 times.

This is the automatically generated source code by the apple FFT library. It’s based on an FFT plan for a 8192 size 1D fft, search for “THIS IS WHAT CAUSES THE ERROR ********************************” to find the error location.

Any help?

Thanks,

Nils

#ifndef M_PI

#define M_PI 0x1.921fb54442d18p+1

#endif

#define complexMul(a,b) ((float2)(mad(-(a).y, (b).y, (a).x * (b).x), mad((a).y, (b).x, (a).x * (b).y)))

#define conj(a) ((float2)((a).x, -(a).y))

#define conjTransp(a) ((float2)(-(a).y, (a).x))

#define fftKernel2(a,dir) \

{ \

	float2 c = (a)[0];	\

	(a)[0] = c + (a)[1];  \

	(a)[1] = c - (a)[1];  \

}

#define fftKernel2S(d1,d2,dir) \

{ \

	float2 c = (d1);   \

	(d1) = c + (d2);   \

	(d2) = c - (d2);   \

}

#define fftKernel4(a,dir) \

{ \

	fftKernel2S((a)[0], (a)[2], dir); \

	fftKernel2S((a)[1], (a)[3], dir); \

	fftKernel2S((a)[0], (a)[1], dir); \

	(a)[3] = (float2)(dir)*(conjTransp((a)[3])); \

	fftKernel2S((a)[2], (a)[3], dir); \

	float2 c = (a)[1]; \

	(a)[1] = (a)[2]; \

	(a)[2] = c; \

}

#define fftKernel4s(a0,a1,a2,a3,dir) \

{ \

	fftKernel2S((a0), (a2), dir); \

	fftKernel2S((a1), (a3), dir); \

	fftKernel2S((a0), (a1), dir); \

	(a3) = (float2)(dir)*(conjTransp((a3))); \

	fftKernel2S((a2), (a3), dir); \

	float2 c = (a1); \

	(a1) = (a2); \

	(a2) = c; \

}

#define bitreverse8(a) \

{ \

	float2 c; \

	c = (a)[1]; \

	(a)[1] = (a)[4]; \

	(a)[4] = c; \

	c = (a)[3]; \

	(a)[3] = (a)[6]; \

	(a)[6] = c; \

}

#define fftKernel8(a,dir) \

{ \

	const float2 w1  = (float2)(0x1.6a09e6p-1f,  dir*0x1.6a09e6p-1f);  \

	const float2 w3  = (float2)(-0x1.6a09e6p-1f, dir*0x1.6a09e6p-1f);  \

	float2 c; \

	fftKernel2S((a)[0], (a)[4], dir); \

	fftKernel2S((a)[1], (a)[5], dir); \

	fftKernel2S((a)[2], (a)[6], dir); \

	fftKernel2S((a)[3], (a)[7], dir); \

	(a)[5] = complexMul(w1, (a)[5]); \

	(a)[6] = (float2)(dir)*(conjTransp((a)[6])); \

	(a)[7] = complexMul(w3, (a)[7]); \

	fftKernel2S((a)[0], (a)[2], dir); \

	fftKernel2S((a)[1], (a)[3], dir); \

	fftKernel2S((a)[4], (a)[6], dir); \

	fftKernel2S((a)[5], (a)[7], dir); \

	(a)[3] = (float2)(dir)*(conjTransp((a)[3])); \

	(a)[7] = (float2)(dir)*(conjTransp((a)[7])); \

	fftKernel2S((a)[0], (a)[1], dir); \

	fftKernel2S((a)[2], (a)[3], dir); \

	fftKernel2S((a)[4], (a)[5], dir); \

	fftKernel2S((a)[6], (a)[7], dir); \

	bitreverse8((a)); \

}

#define bitreverse4x4(a) \

{ \

	float2 c; \

	c = (a)[1];  (a)[1]  = (a)[4];  (a)[4]  = c; \

	c = (a)[2];  (a)[2]  = (a)[8];  (a)[8]  = c; \

	c = (a)[3];  (a)[3]  = (a)[12]; (a)[12] = c; \

	c = (a)[6];  (a)[6]  = (a)[9];  (a)[9]  = c; \

	c = (a)[7];  (a)[7]  = (a)[13]; (a)[13] = c; \

	c = (a)[11]; (a)[11] = (a)[14]; (a)[14] = c; \

}

#define fftKernel16(a,dir) \

{ \

	const float w0 = 0x1.d906bcp-1f; \

	const float w1 = 0x1.87de2ap-2f; \

	const float w2 = 0x1.6a09e6p-1f; \

	fftKernel4s((a)[0], (a)[4], (a)[8],  (a)[12], dir); \

	fftKernel4s((a)[1], (a)[5], (a)[9],  (a)[13], dir); \

	fftKernel4s((a)[2], (a)[6], (a)[10], (a)[14], dir); \

	fftKernel4s((a)[3], (a)[7], (a)[11], (a)[15], dir); \

	(a)[5]  = complexMul((a)[5], (float2)(w0, dir*w1)); \

	(a)[6]  = complexMul((a)[6], (float2)(w2, dir*w2)); \

	(a)[7]  = complexMul((a)[7], (float2)(w1, dir*w0)); \

	(a)[9]  = complexMul((a)[9], (float2)(w2, dir*w2)); \

	(a)[10] = (float2)(dir)*(conjTransp((a)[10])); \

	(a)[11] = complexMul((a)[11], (float2)(-w2, dir*w2)); \

	(a)[13] = complexMul((a)[13], (float2)(w1, dir*w0)); \

	(a)[14] = complexMul((a)[14], (float2)(-w2, dir*w2)); \

	(a)[15] = complexMul((a)[15], (float2)(-w0, dir*-w1)); \

	fftKernel4((a), dir); \

	fftKernel4((a) + 4, dir); \

	fftKernel4((a) + 8, dir); \

	fftKernel4((a) + 12, dir); \

	bitreverse4x4((a)); \

}

#define bitreverse32(a) \

{ \

	float2 c1, c2; \

	c1 = (a)[2];   (a)[2] = (a)[1];   c2 = (a)[4];   (a)[4] = c1;   c1 = (a)[8];   (a)[8] = c2;	c2 = (a)[16];  (a)[16] = c1;   (a)[1] = c2; \

	c1 = (a)[6];   (a)[6] = (a)[3];   c2 = (a)[12];  (a)[12] = c1;  c1 = (a)[24];  (a)[24] = c2;   c2 = (a)[17];  (a)[17] = c1;   (a)[3] = c2; \

	c1 = (a)[10];  (a)[10] = (a)[5];  c2 = (a)[20];  (a)[20] = c1;  c1 = (a)[9];   (a)[9] = c2;	c2 = (a)[18];  (a)[18] = c1;   (a)[5] = c2; \

	c1 = (a)[14];  (a)[14] = (a)[7];  c2 = (a)[28];  (a)[28] = c1;  c1 = (a)[25];  (a)[25] = c2;   c2 = (a)[19];  (a)[19] = c1;   (a)[7] = c2; \

	c1 = (a)[22];  (a)[22] = (a)[11]; c2 = (a)[13];  (a)[13] = c1;  c1 = (a)[26];  (a)[26] = c2;   c2 = (a)[21];  (a)[21] = c1;   (a)[11] = c2; \

	c1 = (a)[30];  (a)[30] = (a)[15]; c2 = (a)[29];  (a)[29] = c1;  c1 = (a)[27];  (a)[27] = c2;   c2 = (a)[23];  (a)[23] = c1;   (a)[15] = c2; \

}

#define fftKernel32(a,dir) \

{ \

	fftKernel2S((a)[0],  (a)[16], dir); \

	fftKernel2S((a)[1],  (a)[17], dir); \

	fftKernel2S((a)[2],  (a)[18], dir); \

	fftKernel2S((a)[3],  (a)[19], dir); \

	fftKernel2S((a)[4],  (a)[20], dir); \

	fftKernel2S((a)[5],  (a)[21], dir); \

	fftKernel2S((a)[6],  (a)[22], dir); \

	fftKernel2S((a)[7],  (a)[23], dir); \

	fftKernel2S((a)[8],  (a)[24], dir); \

	fftKernel2S((a)[9],  (a)[25], dir); \

	fftKernel2S((a)[10], (a)[26], dir); \

	fftKernel2S((a)[11], (a)[27], dir); \

	fftKernel2S((a)[12], (a)[28], dir); \

	fftKernel2S((a)[13], (a)[29], dir); \

	fftKernel2S((a)[14], (a)[30], dir); \

	fftKernel2S((a)[15], (a)[31], dir); \

	(a)[17] = complexMul((a)[17], (float2)(0x1.f6297cp-1f, dir*0x1.8f8b84p-3f)); \

	(a)[18] = complexMul((a)[18], (float2)(0x1.d906bcp-1f, dir*0x1.87de2ap-2f)); \

	(a)[19] = complexMul((a)[19], (float2)(0x1.a9b662p-1f, dir*0x1.1c73b4p-1f)); \

	(a)[20] = complexMul((a)[20], (float2)(0x1.6a09e6p-1f, dir*0x1.6a09e6p-1f)); \

	(a)[21] = complexMul((a)[21], (float2)(0x1.1c73b4p-1f, dir*0x1.a9b662p-1f)); \

	(a)[22] = complexMul((a)[22], (float2)(0x1.87de2ap-2f, dir*0x1.d906bcp-1f)); \

	(a)[23] = complexMul((a)[23], (float2)(0x1.8f8b84p-3f, dir*0x1.f6297cp-1f)); \

	(a)[24] = complexMul((a)[24], (float2)(0x0p+0f, dir*0x1p+0f)); \

	(a)[25] = complexMul((a)[25], (float2)(-0x1.8f8b84p-3f, dir*0x1.f6297cp-1f)); \

	(a)[26] = complexMul((a)[26], (float2)(-0x1.87de2ap-2f, dir*0x1.d906bcp-1f)); \

	(a)[27] = complexMul((a)[27], (float2)(-0x1.1c73b4p-1f, dir*0x1.a9b662p-1f)); \

	(a)[28] = complexMul((a)[28], (float2)(-0x1.6a09e6p-1f, dir*0x1.6a09e6p-1f)); \

	(a)[29] = complexMul((a)[29], (float2)(-0x1.a9b662p-1f, dir*0x1.1c73b4p-1f)); \

	(a)[30] = complexMul((a)[30], (float2)(-0x1.d906bcp-1f, dir*0x1.87de2ap-2f)); \

	(a)[31] = complexMul((a)[31], (float2)(-0x1.f6297cp-1f, dir*0x1.8f8b84p-3f)); \

	fftKernel16((a), dir); \

	fftKernel16((a) + 16, dir); \

	bitreverse32((a)); \

}

__kernel void \

clFFT_1DTwistInterleaved(__global float2 *in, unsigned int startRow, unsigned int numCols, unsigned int N, unsigned int numRowsToProcess, int dir) \

{ \

   float2 a, w; \

   float ang; \

   unsigned int j; \

	unsigned int i = get_global_id(0); \

	unsigned int startIndex = mad24(startRow, numCols, i); \

	 \

	if(i < numCols) \

	{ \

		for(j = 0; j < numRowsToProcess; j++) \

		{ \

			a = in[startIndex]; \

			ang = 2.0f * M_PI * dir * i * (startRow + j) / N; \

			w = (float2)(native_cos(ang), native_sin(ang)); \

			a = complexMul(a, w); \

			in[startIndex] = a; \

			startIndex += numCols; \

		} \

	}	 \

} \

__kernel void fft0(__global float2 *in, __global float2 *out, int dir, int S)

{

	__local float sMem[2064];

	int i, j, r, indexIn, indexOut, index, tid, bNum, xNum, k, l;

	int s, ii, jj, offset;

	float2 w;

	float ang, angf, ang1;

	__local float *lMemStore, *lMemLoad;

	float2 a[16];

	int lId = get_local_id( 0 );

	int groupId = get_group_id( 0 );

bNum = groupId & 3;

xNum = groupId >> 2;

indexIn = mul24(bNum, 16);

tid = indexIn;

i = tid >> 0;

j = tid & 0;

indexOut = mad24(i, 128, j);

indexIn += (xNum << 13);

indexOut += (xNum << 13);

tid = lId;

i = tid & 15;

j = tid >> 4;

indexIn += mad24(j, 64, i);

in += indexIn;

a[0] = in[0];

a[1] = in[512];

a[2] = in[1024];

a[3] = in[1536];

a[4] = in[2048];

a[5] = in[2560];

a[6] = in[3072];

a[7] = in[3584];

a[8] = in[4096];

a[9] = in[4608];

a[10] = in[5120];

a[11] = in[5632];

a[12] = in[6144];

a[13] = in[6656];

a[14] = in[7168];

a[15] = in[7680];

fftKernel16(a, dir); // THIS IS WHAT CAUSES THE ERROR ************************************************

ang = dir*(2.0f*M_PI*1/128)*j;

w = (float2)(native_cos(ang), native_sin(ang));

a[1] = complexMul(a[1], w);

ang = dir*(2.0f*M_PI*2/128)*j;

w = (float2)(native_cos(ang), native_sin(ang));

a[2] = complexMul(a[2], w);

ang = dir*(2.0f*M_PI*3/128)*j;

w = (float2)(native_cos(ang), native_sin(ang));

a[3] = complexMul(a[3], w);

ang = dir*(2.0f*M_PI*4/128)*j;

w = (float2)(native_cos(ang), native_sin(ang));

a[4] = complexMul(a[4], w);

ang = dir*(2.0f*M_PI*5/128)*j;

w = (float2)(native_cos(ang), native_sin(ang));

a[5] = complexMul(a[5], w);

ang = dir*(2.0f*M_PI*6/128)*j;

w = (float2)(native_cos(ang), native_sin(ang));

a[6] = complexMul(a[6], w);

ang = dir*(2.0f*M_PI*7/128)*j;

w = (float2)(native_cos(ang), native_sin(ang));

a[7] = complexMul(a[7], w);

ang = dir*(2.0f*M_PI*8/128)*j;

w = (float2)(native_cos(ang), native_sin(ang));

a[8] = complexMul(a[8], w);

ang = dir*(2.0f*M_PI*9/128)*j;

w = (float2)(native_cos(ang), native_sin(ang));

a[9] = complexMul(a[9], w);

ang = dir*(2.0f*M_PI*10/128)*j;

w = (float2)(native_cos(ang), native_sin(ang));

a[10] = complexMul(a[10], w);

ang = dir*(2.0f*M_PI*11/128)*j;

w = (float2)(native_cos(ang), native_sin(ang));

a[11] = complexMul(a[11], w);

ang = dir*(2.0f*M_PI*12/128)*j;

w = (float2)(native_cos(ang), native_sin(ang));

a[12] = complexMul(a[12], w);

ang = dir*(2.0f*M_PI*13/128)*j;

w = (float2)(native_cos(ang), native_sin(ang));

a[13] = complexMul(a[13], w);

ang = dir*(2.0f*M_PI*14/128)*j;

w = (float2)(native_cos(ang), native_sin(ang));

a[14] = complexMul(a[14], w);

ang = dir*(2.0f*M_PI*15/128)*j;

w = (float2)(native_cos(ang), native_sin(ang));

a[15] = complexMul(a[15], w);

indexIn = mad24(j, 256, i);

lMemStore = sMem + tid;

lMemLoad = sMem + indexIn;

lMemStore[0] = a[0].x;

lMemStore[128] = a[1].x;

lMemStore[256] = a[2].x;

lMemStore[384] = a[3].x;

lMemStore[512] = a[4].x;

lMemStore[640] = a[5].x;

lMemStore[768] = a[6].x;

lMemStore[896] = a[7].x;

lMemStore[1024] = a[8].x;

lMemStore[1152] = a[9].x;

lMemStore[1280] = a[10].x;

lMemStore[1408] = a[11].x;

lMemStore[1536] = a[12].x;

lMemStore[1664] = a[13].x;

lMemStore[1792] = a[14].x;

lMemStore[1920] = a[15].x;

barrier(CLK_LOCAL_MEM_FENCE);

a[0].x = lMemLoad[0];

a[1].x = lMemLoad[16];

a[2].x = lMemLoad[32];

a[3].x = lMemLoad[48];

a[4].x = lMemLoad[64];

a[5].x = lMemLoad[80];

a[6].x = lMemLoad[96];

a[7].x = lMemLoad[112];

a[8].x = lMemLoad[128];

a[9].x = lMemLoad[144];

a[10].x = lMemLoad[160];

a[11].x = lMemLoad[176];

a[12].x = lMemLoad[192];

a[13].x = lMemLoad[208];

a[14].x = lMemLoad[224];

a[15].x = lMemLoad[240];

barrier(CLK_LOCAL_MEM_FENCE);

lMemStore[0] = a[0].y;

lMemStore[128] = a[1].y;

lMemStore[256] = a[2].y;

lMemStore[384] = a[3].y;

lMemStore[512] = a[4].y;

lMemStore[640] = a[5].y;

lMemStore[768] = a[6].y;

lMemStore[896] = a[7].y;

lMemStore[1024] = a[8].y;

lMemStore[1152] = a[9].y;

lMemStore[1280] = a[10].y;

lMemStore[1408] = a[11].y;

lMemStore[1536] = a[12].y;

lMemStore[1664] = a[13].y;

lMemStore[1792] = a[14].y;

lMemStore[1920] = a[15].y;

barrier(CLK_LOCAL_MEM_FENCE);

a[0].y = lMemLoad[0];

a[1].y = lMemLoad[16];

a[2].y = lMemLoad[32];

a[3].y = lMemLoad[48];

a[4].y = lMemLoad[64];

a[5].y = lMemLoad[80];

a[6].y = lMemLoad[96];

a[7].y = lMemLoad[112];

a[8].y = lMemLoad[128];

a[9].y = lMemLoad[144];

a[10].y = lMemLoad[160];

a[11].y = lMemLoad[176];

a[12].y = lMemLoad[192];

a[13].y = lMemLoad[208];

a[14].y = lMemLoad[224];

a[15].y = lMemLoad[240];

barrier(CLK_LOCAL_MEM_FENCE);

fftKernel8(a + 0, dir);

fftKernel8(a + 8, dir);

l = ((bNum << 4) + i) >> 0;

k = j << 1;

ang1 = dir*(2.0f*M_PI/8192)*l;

ang = ang1*(k + 0);

w = (float2)(native_cos(ang), native_sin(ang));

a[0] = complexMul(a[0], w);

ang = ang1*(k + 16);

w = (float2)(native_cos(ang), native_sin(ang));

a[1] = complexMul(a[1], w);

ang = ang1*(k + 32);

w = (float2)(native_cos(ang), native_sin(ang));

a[2] = complexMul(a[2], w);

ang = ang1*(k + 48);

w = (float2)(native_cos(ang), native_sin(ang));

a[3] = complexMul(a[3], w);

ang = ang1*(k + 64);

w = (float2)(native_cos(ang), native_sin(ang));

a[4] = complexMul(a[4], w);

ang = ang1*(k + 80);

w = (float2)(native_cos(ang), native_sin(ang));

a[5] = complexMul(a[5], w);

ang = ang1*(k + 96);

w = (float2)(native_cos(ang), native_sin(ang));

a[6] = complexMul(a[6], w);

ang = ang1*(k + 112);

w = (float2)(native_cos(ang), native_sin(ang));

a[7] = complexMul(a[7], w);

ang = ang1*(k + 1);

w = (float2)(native_cos(ang), native_sin(ang));

a[8] = complexMul(a[8], w);

ang = ang1*(k + 17);

w = (float2)(native_cos(ang), native_sin(ang));

a[9] = complexMul(a[9], w);

ang = ang1*(k + 33);

w = (float2)(native_cos(ang), native_sin(ang));

a[10] = complexMul(a[10], w);

ang = ang1*(k + 49);

w = (float2)(native_cos(ang), native_sin(ang));

a[11] = complexMul(a[11], w);

ang = ang1*(k + 65);

w = (float2)(native_cos(ang), native_sin(ang));

a[12] = complexMul(a[12], w);

ang = ang1*(k + 81);

w = (float2)(native_cos(ang), native_sin(ang));

a[13] = complexMul(a[13], w);

ang = ang1*(k + 97);

w = (float2)(native_cos(ang), native_sin(ang));

a[14] = complexMul(a[14], w);

ang = ang1*(k + 113);

w = (float2)(native_cos(ang), native_sin(ang));

a[15] = complexMul(a[15], w);

lMemStore = sMem + mad24(i, 129, j << 1);

lMemLoad = sMem + mad24(tid >> 7, 129, tid & 127);

lMemStore[ 0] = a[0].x;

lMemStore[ 16] = a[1].x;

lMemStore[ 32] = a[2].x;

lMemStore[ 48] = a[3].x;

lMemStore[ 64] = a[4].x;

lMemStore[ 80] = a[5].x;

lMemStore[ 96] = a[6].x;

lMemStore[ 112] = a[7].x;

lMemStore[ 1] = a[8].x;

lMemStore[ 17] = a[9].x;

lMemStore[ 33] = a[10].x;

lMemStore[ 49] = a[11].x;

lMemStore[ 65] = a[12].x;

lMemStore[ 81] = a[13].x;

lMemStore[ 97] = a[14].x;

lMemStore[ 113] = a[15].x;

barrier(CLK_LOCAL_MEM_FENCE);

a[0].x = lMemLoad[0];

a[1].x = lMemLoad[129];

a[2].x = lMemLoad[258];

a[3].x = lMemLoad[387];

a[4].x = lMemLoad[516];

a[5].x = lMemLoad[645];

a[6].x = lMemLoad[774];

a[7].x = lMemLoad[903];

a[8].x = lMemLoad[1032];

a[9].x = lMemLoad[1161];

a[10].x = lMemLoad[1290];

a[11].x = lMemLoad[1419];

a[12].x = lMemLoad[1548];

a[13].x = lMemLoad[1677];

a[14].x = lMemLoad[1806];

a[15].x = lMemLoad[1935];

barrier(CLK_LOCAL_MEM_FENCE);

lMemStore[ 0] = a[0].y;

lMemStore[ 16] = a[1].y;

lMemStore[ 32] = a[2].y;

lMemStore[ 48] = a[3].y;

lMemStore[ 64] = a[4].y;

lMemStore[ 80] = a[5].y;

lMemStore[ 96] = a[6].y;

lMemStore[ 112] = a[7].y;

lMemStore[ 1] = a[8].y;

lMemStore[ 17] = a[9].y;

lMemStore[ 33] = a[10].y;

lMemStore[ 49] = a[11].y;

lMemStore[ 65] = a[12].y;

lMemStore[ 81] = a[13].y;

lMemStore[ 97] = a[14].y;

lMemStore[ 113] = a[15].y;

barrier(CLK_LOCAL_MEM_FENCE);

a[0].y = lMemLoad[0];

a[1].y = lMemLoad[129];

a[2].y = lMemLoad[258];

a[3].y = lMemLoad[387];

a[4].y = lMemLoad[516];

a[5].y = lMemLoad[645];

a[6].y = lMemLoad[774];

a[7].y = lMemLoad[903];

a[8].y = lMemLoad[1032];

a[9].y = lMemLoad[1161];

a[10].y = lMemLoad[1290];

a[11].y = lMemLoad[1419];

a[12].y = lMemLoad[1548];

a[13].y = lMemLoad[1677];

a[14].y = lMemLoad[1806];

a[15].y = lMemLoad[1935];

barrier(CLK_LOCAL_MEM_FENCE);

indexOut += tid;

out += indexOut;

out[0] = a[0];

out[128] = a[1];

out[256] = a[2];

out[384] = a[3];

out[512] = a[4];

out[640] = a[5];

out[768] = a[6];

out[896] = a[7];

out[1024] = a[8];

out[1152] = a[9];

out[1280] = a[10];

out[1408] = a[11];

out[1536] = a[12];

out[1664] = a[13];

out[1792] = a[14];

out[1920] = a[15];

}

__kernel void fft1(__global float2 *in, __global float2 *out, int dir, int S)

{

	__local float sMem[1024];

	int i, j, r, indexIn, indexOut, index, tid, bNum, xNum, k, l;

	int s, ii, jj, offset;

	float2 w;

	float ang, angf, ang1;

	__local float *lMemStore, *lMemLoad;

	float2 a[8];

	int lId = get_local_id( 0 );

	int groupId = get_group_id( 0 );

bNum = groupId & 7;

xNum = groupId >> 3;

indexIn = mul24(bNum, 16);

tid = indexIn;

i = tid >> 7;

j = tid & 127;

indexOut = mad24(i, 8192, j);

indexIn += (xNum << 13);

indexOut += (xNum << 13);

tid = lId;

i = tid & 15;

j = tid >> 4;

indexIn += mad24(j, 128, i);

in += indexIn;

a[0] = in[0];

a[1] = in[1024];

a[2] = in[2048];

a[3] = in[3072];

a[4] = in[4096];

a[5] = in[5120];

a[6] = in[6144];

a[7] = in[7168];

fftKernel8(a, dir);

ang = dir*(2.0f*M_PI*1/64)*j;

w = (float2)(native_cos(ang), native_sin(ang));

a[1] = complexMul(a[1], w);

ang = dir*(2.0f*M_PI*2/64)*j;

w = (float2)(native_cos(ang), native_sin(ang));

a[2] = complexMul(a[2], w);

ang = dir*(2.0f*M_PI*3/64)*j;

w = (float2)(native_cos(ang), native_sin(ang));

a[3] = complexMul(a[3], w);

ang = dir*(2.0f*M_PI*4/64)*j;

w = (float2)(native_cos(ang), native_sin(ang));

a[4] = complexMul(a[4], w);

ang = dir*(2.0f*M_PI*5/64)*j;

w = (float2)(native_cos(ang), native_sin(ang));

a[5] = complexMul(a[5], w);

ang = dir*(2.0f*M_PI*6/64)*j;

w = (float2)(native_cos(ang), native_sin(ang));

a[6] = complexMul(a[6], w);

ang = dir*(2.0f*M_PI*7/64)*j;

w = (float2)(native_cos(ang), native_sin(ang));

a[7] = complexMul(a[7], w);

indexIn = mad24(j, 128, i);

lMemStore = sMem + tid;

lMemLoad = sMem + indexIn;

lMemStore[0] = a[0].x;

lMemStore[128] = a[1].x;

lMemStore[256] = a[2].x;

lMemStore[384] = a[3].x;

lMemStore[512] = a[4].x;

lMemStore[640] = a[5].x;

lMemStore[768] = a[6].x;

lMemStore[896] = a[7].x;

barrier(CLK_LOCAL_MEM_FENCE);

a[0].x = lMemLoad[0];

a[1].x = lMemLoad[16];

a[2].x = lMemLoad[32];

a[3].x = lMemLoad[48];

a[4].x = lMemLoad[64];

a[5].x = lMemLoad[80];

a[6].x = lMemLoad[96];

a[7].x = lMemLoad[112];

barrier(CLK_LOCAL_MEM_FENCE);

lMemStore[0] = a[0].y;

lMemStore[128] = a[1].y;

lMemStore[256] = a[2].y;

lMemStore[384] = a[3].y;

lMemStore[512] = a[4].y;

lMemStore[640] = a[5].y;

lMemStore[768] = a[6].y;

lMemStore[896] = a[7].y;

barrier(CLK_LOCAL_MEM_FENCE);

a[0].y = lMemLoad[0];

a[1].y = lMemLoad[16];

a[2].y = lMemLoad[32];

a[3].y = lMemLoad[48];

a[4].y = lMemLoad[64];

a[5].y = lMemLoad[80];

a[6].y = lMemLoad[96];

a[7].y = lMemLoad[112];

barrier(CLK_LOCAL_MEM_FENCE);

fftKernel8(a + 0, dir);

indexOut += mad24(j, 128, i);

out += indexOut;

out[0] = a[0];

out[1024] = a[1];

out[2048] = a[2];

out[3072] = a[3];

out[4096] = a[4];

out[5120] = a[5];

out[6144] = a[6];

out[7168] = a[7];

}

I am having the same exact issue. I was porting the apple opencl fft code to my linux box, not that that would matter much. It seems to not always happen though, some of the example sizes work in param.txt, some dont. So, it must have something to do with a size or something. Not sure…

Anybody have any ideas??
Thanks!

Meanwhile I figured it out :)

Take a look at fft_base_kernels.h, see line 4 of “baseKernels”, the complexMul line. The define seems to be too complicated to the NVidia OpenCL compiler, I replaced the define by a function and it’s now working:

float2 complexMul(float2 a,float2 b) { return (float2)(mad(-(a).y, (b).y, (a).x * (b).x), mad((a).y, (b).x, (a).x * (b).y));}

Have fun :)

Nils

Sure enough, works for me as well. Good catch!
Thanks!

HI !

Does someone has port the FFT code from Apple to Linux? or should I do it myself?..

Thx for help

Same question for windows :ph34r:
With some minor modification, I was able to compile the lib but not the main.cpp. However I hope that it’ll work.

Haven’t googled it yet but I have some missing .h, (<mach/mach_time.h>, <Accelerate/Accelerate.h>). I’m looking to know if these missing files are critical or if there exist equivalent or if they are only used for benchmarking purpose.

Just looked deeply for the missing libs, Accelerate.h includes the proprietary computing framework of Apple so it seems like a simple port will be difficult on Linux as on Windows :(

edit : after a closer look, the Accelerate framework is just used to compute reference FFT’s. It should work then :woot:

Yes, but <mach/mach_time.h>, <Accelerate/Accelerate.h> libs are just used to compute the FFT with le CPU and compare if the result is the same than with the GPU. So you can drop this part.

For me the FFT (2D) compute correctly on linux.

Since the 3.0 version, i have a big compilation time problem of my FFT kernel code. Its make more than 150sec…

Do I am the only one with this problem?

If someone have a idea…

Thx

Did you solve it?

I have the same issue since i tried to convert it to double precision.

thanks for answering

Did you solve it?

I have the same issue since i tried to convert it to double precision.

thanks for answering

I have also encountered some bugs in nvidia’s C preprocessor where it spits out errors when it shouldn’t. My workaround is the following:

  • Add a post-build event to the project and put the following commands in it (assuming your source file is called kernels.cl):

rem This command uses the MS C compiler to preprocess the source file. By default it puts the output in kernels.i
cl /P kernels.cl
move kernels.i kernelsPreprocessed.cl

  • When you load your kernel file in the host program, load kernelsPreprocessed.cl instead of the original.

This method will avoid any bugs in Nvidia’s C preprocessor by simply using microsoft’s C preprocessor instead. I’m sure you can do something similar using gcc if you’re developing on linux, just add those lines to your makefile. The only problem with this is if you modify kernels.cl you have to either rebuild your host project or run the above commands manually.

I have also encountered some bugs in nvidia’s C preprocessor where it spits out errors when it shouldn’t. My workaround is the following:

  • Add a post-build event to the project and put the following commands in it (assuming your source file is called kernels.cl):

rem This command uses the MS C compiler to preprocess the source file. By default it puts the output in kernels.i
cl /P kernels.cl
move kernels.i kernelsPreprocessed.cl

  • When you load your kernel file in the host program, load kernelsPreprocessed.cl instead of the original.

This method will avoid any bugs in Nvidia’s C preprocessor by simply using microsoft’s C preprocessor instead. I’m sure you can do something similar using gcc if you’re developing on linux, just add those lines to your makefile. The only problem with this is if you modify kernels.cl you have to either rebuild your host project or run the above commands manually.

Thanks for trying to help.

In this OpenCL FFT library (and i prefer using it as a library and not integrate it in my source code), the OpenCL kernels code is “generated” at run-time.
You have a main header who contains 2/3 strings with the major part of the code and then during execution, the code uses a lot af strings-functions to build a string that contains only what you and an “optimised” code for the case you want to study.
In a first approch, i thought it was this stage who cost time but it appears that this auto-generate string is long to compile.
I already have tried, as you suggest, to output this string, precompile it with gcc -E and then try to compile this new file in OpenCL but that makes no difference.

Thanks for trying to help.

In this OpenCL FFT library (and i prefer using it as a library and not integrate it in my source code), the OpenCL kernels code is “generated” at run-time.
You have a main header who contains 2/3 strings with the major part of the code and then during execution, the code uses a lot af strings-functions to build a string that contains only what you and an “optimised” code for the case you want to study.
In a first approch, i thought it was this stage who cost time but it appears that this auto-generate string is long to compile.
I already have tried, as you suggest, to output this string, precompile it with gcc -E and then try to compile this new file in OpenCL but that makes no difference.