Problem with idle threads and barrier

I launch the following (simplified) kernel with a local work-group size of 16x16 and a global size of 128x128. Everything is fine if width and height are both equal to 128. However, if I set width to 127 (or a smaller value), so that some threads are idle, the kernel hangs. This still happens with the new 195.39 beta (I’ve got a GeForce 8800 Ultra and use the 32-bit XP version).

[codebox]#define BLOCK_WIDTH 16 /* row stride used to index the kernel's __local tile */

#define BLOCK_HEIGHT 16 /* together with BLOCK_WIDTH, sizes the __local tile (BLOCK_WIDTH*BLOCK_HEIGHT elements) */

/*
 * Minimal reproduction kernel: stages one byte per work-item through
 * work-group-local memory and writes it back to global memory.
 *
 * in     - unused by this kernel.
 * out    - destination buffer; the element at posy*width + posx is written
 *          by every work-item whose global position lies inside the
 *          width x height rectangle.
 * width  - number of valid columns; also the row stride used for out.
 * height - number of valid rows.
 *
 * Work-items outside the rectangle skip both the local store and the global
 * store, but the barrier is placed outside the conditionals so that every
 * work-item of the group still reaches it.
 */
__kernel void hang(__global uchar *in, __global uchar *out,
               unsigned int width, unsigned int height)
{
    __local uchar local_block[BLOCK_WIDTH*BLOCK_HEIGHT];

    const int posx = get_global_id(0);
    const int posy = get_global_id(1);

    /* Flat offset of this work-item's pixel in out (row stride = width). */
    const int linear = posy*width + posx;

    /* This work-item's slot inside the work-group tile. */
    const size_t slot = get_local_id(0)+get_local_id(1)*BLOCK_WIDTH;

    /* Non-zero only for work-items that map to a pixel inside the image. */
    const int in_range = (posx < width && posy < height);

    if (in_range) {
        local_block[slot] = 255;
    }

    /* Reached unconditionally by all work-items, including the idle ones. */
    barrier(CLK_LOCAL_MEM_FENCE);

    if (in_range) {
        out[linear] = local_block[slot];
    }
}[/codebox]