I launch the following (simplified) kernel with a local work-group size of 16x16 and a global size of 128x128. Everything is fine if width and height are both equal to 128. However, if I set width to 127 (or a smaller value), so that some threads are idle, the kernel hangs. This is still true with the new 195.39 beta (I’ve got a GeForce 8800 Ultra and use the 32-bit XP version).
[codebox]#define BLOCK_WIDTH 16 /* row length used to index and size local_block in the kernel */
#define BLOCK_HEIGHT 16 /* row count used to size local_block in the kernel */
/*
 * Simplified reproduction kernel.
 *
 * Each work-item whose global position lies inside the width x height
 * region stores 255 into its own slot of a work-group-local buffer, the
 * work-group synchronizes on a local-memory barrier, and the same in-range
 * work-items then copy their slot to out[posy*width + posx].
 *
 * in     - not referenced by this simplified kernel.
 * out    - destination buffer; only elements at in-range positions are written.
 * width  - number of columns treated as valid (also the row pitch of out).
 * height - number of rows treated as valid.
 *
 * NOTE(review): the local_block indexing assumes the kernel is launched with
 * a local size of BLOCK_WIDTH x BLOCK_HEIGHT - confirm against the host code.
 */
__kernel void hang(__global uchar *in, __global uchar *out,
unsigned int width, unsigned int height)
{
const int posx = get_global_id(0);
const int posy = get_global_id(1);
/* One byte of work-group-local storage per work-item slot. */
__local uchar local_block[BLOCK_WIDTH*BLOCK_HEIGHT];
/* Offset into out; computed by every work-item but only used by in-range ones. */
const int linear = posy*width + posx;
/* Out-of-range work-items skip this store, so their slot is never written (nor read below). */
if (posx < width && posy < height)
local_block[get_local_id(0)+get_local_id(1)*BLOCK_WIDTH] = 255;
/* Not guarded by the range check: in-range and out-of-range work-items alike execute the barrier. */
barrier(CLK_LOCAL_MEM_FENCE);
/* Same range check as above; each in-range work-item reads back only its own slot. */
if (posx < width && posy < height)
out[linear] = local_block[get_local_id(0)+get_local_id(1)*BLOCK_WIDTH];
return;
}[/codebox]