Hi,

this is probably a really newbie question, but I really do need help here conceptually.

So please excuse me, if my questions or problems are simple.

I’m trying to implement a Radix-4 FFT on a kernel. Problem is, I think that it should be

really parallel friendly, but I don’t have a good idea of how to organize threads.

I reworked the code, so that instead the innermost for loop I have threads,

and this does work correctly for 4^6 samples. Anything higher,

and I can’t have a thread for a sample, and my code falls apart, giving strange results.

I think that it’s mainly due to synchronisation, and I probably should make the grid 2d,

but I don’t really have any good idea of how to do it.

(the cuComplex struct was lifted from CUDA by Examples book)

Any ideas would be really appreciated.

```
#include "math.h"
#ifndef PI
#define PI 3.14159265358979
#endif
struct cuComplex {
float r;
float i;
cuComplex( float a, float b ) : r(a), i(b) {}
__device__ float magnitude2( void ) {
return r * r + i * i;
}
__device__ cuComplex operator*(const cuComplex& a) {
return cuComplex(r*a.r - i*a.i, i*a.r + r*a.i);
}
__device__ cuComplex operator+(const cuComplex& a) {
return cuComplex(r+a.r, i+a.i);
}
__device__ cuComplex operator-(const cuComplex& a) {
return cuComplex(r-a.r, i-a.i);
}
};
__global__ void radix(float *re_s, float *im_s, int p ,int N) {
int n=blockIdx.x * blockDim.x + threadIdx.x;
int iy=blockIdx.y * blockDim.y + threadIdx.y;
int M = N/4;
cuComplex w(0.0,0.0), a(0.0,0.0), b(0.0,0.0),c(0.0,0.0),d(0.0,0.0), temp(0.0,0.0),
x1(0.0,0.0),x2(0.0,0.0),x3(0.0,0.0),x4(0.0,0.0), j(0.0, 1.0), mj(0.0,-1.0);
for (int stage = 1; stage <=p; stage++) {
int k=iy*(N/(pow(4.0,stage-1)));
if (n <= M-1 && k <= N-1) {
x1.r = re_s[n+1+k];
x1.i = im_s[n+1+k];
x2.r = re_s[n+(M)+1+k];
x2.i = im_s[n+(M)+1+k];
x3.r = re_s[n+(2*M)+1+k];
x3.i = im_s[n+(2*M)+1+k];
x4.r = re_s[n+(3*M)+1+k];
x4.i = im_s[n+(3*M)+1+k];
a = x1+x2+x3+x4;
b = x1 + mj*x2-x3+j*x4;
c= x1-x2+x3-x4;
d = x1+j*x2-x3+mj*x4;
///////////////////////////////////////////////////////////////////////////////
///////// 1 /////////////
w.r = cos(((-2.0*PI)/N)* (0) *(pow(4.0,stage-1)));
w.i = sin(((-2.0*PI)/N)* (0) *(pow(4.0,stage-1)));
temp = w*a;
// temp = complmul(w, a);
re_s[n+1+k] = temp.r;
im_s[n+1+k] = temp.i;
///////// 2 /////////////
w.r = cos(((-2.0*PI)/N)* (n*1) *(pow(4.0,stage-1)));
w.i = sin(((-2.0*PI)/N)* (n*1) *(pow(4.0,stage-1)));
// temp = complmul(w, b);
temp = w*b;
re_s[M+n+1+k] = temp.r;
im_s[M+n+1+k] = temp.i;
///////// 3 /////////////
w.r = cos(((-2.0*PI)/N)* (n*2) *(pow(4.0,stage-1)));
w.i = sin(((-2.0*PI)/N)* (n*2) *(pow(4.0,stage-1)));
// temp = complmul(w, c);
temp = w*c;
re_s[M*2+n+1+k] = temp.r;
im_s[M*2+n+1+k] = temp.i;
///////// 4 /////////////
w.r = cos(((-2.0*PI)/N)* (n*3) *(pow(4.0,stage-1)));
w.i = sin(((-2.0*PI)/N)* (n*3) *(pow(4.0,stage-1)));
// temp = complmul(w, d);
temp = w*d;
re_s[M*3+n+1+k] = temp.r;
im_s[M*3+n+1+k] = temp.i;
}
M=M/4;
}
}
```