I wrote a simple DFT kernel that accepts pointers to global input & output arrays. It works great until the size of the arrays (N) grows above 32K, at which point it crashes the device. Does anybody know what limitation I’m running into?
GeForce GTS 250 graphics card
Kernel:
[codebox]
/* Single-precision complex number, stored as an interleaved (real, imag) pair. */
typedef struct
{
    float r; /* real part */
    float i; /* imaginary part */
} complex;
/*
 * Naive O(N) per-work-item DFT: work-item n computes output bin n as
 *   fout[n] = sum_k fin[k] * e^{+i * 2*pi * k * n / N}
 * (note: the + sign makes this the inverse-DFT phase convention, unscaled).
 *
 * fin  - input array of N complex samples (global memory)
 * fout - output array of N complex bins; element n written only by work-item n
 * N    - transform length; launch with a global size of N
 *
 * NOTE(review): the "crashes above 32K" symptom is most likely the display
 * watchdog timer killing the kernel — total work is O(N^2), so runtime grows
 * quadratically and exceeds the driver's limit on a display GPU. Run on a
 * non-display device, or split the launch into smaller batches — TODO confirm.
 */
__kernel void dft( __global complex *fin,
__global complex *fout,
const int N )
{
float2 tot = (float2)(0.0f, 0.0f);
int n = (int)get_global_id(0);
for (int k = 0 ; k < N ; k++)
{
/* Reduce k*n mod N in 64-bit integer math BEFORE converting to float:
 *  - (int)k * n overflows 32-bit signed int (UB) once N exceeds ~46K;
 *  - even below that, products above 2^24 cannot be represented exactly
 *    in a float (24-bit mantissa), so the phase angle becomes garbage.
 * The reduced value is < N, so the float conversion is exact. */
float angle = (6.283185307179586476925286766559f *
               (float)(((long)k * n) % N)) / (float)N;
float twiddle_i;
float twiddle_r = sincos( angle, &twiddle_i );
/* Complex multiply-accumulate: tot += fin[k] * twiddle */
tot.x += fin[k].r * twiddle_r - fin[k].i * twiddle_i;
tot.y += fin[k].r * twiddle_i + fin[k].i * twiddle_r;
}
/* No fence needed: each work-item writes only its own output element, and
 * the original write_mem_fence (placed before the writes) ordered nothing. */
fout[n].r = tot.x;
fout[n].i = tot.y;
}
[/codebox]