Hi I have a piece of code where each thread writes to it’s own shared memory location. Afterwards I perform a __syncthreads(); and then read shared memory (also from the neighbours). Unfortunately one thread gets different values as another threads, even though the element read should be the same. The code follows:
__global__ void facet_kernel (unsigned int blocks_offset, int2 g_blocklist){
__shared__ int s_block_x, s_block_y;
__shared__ unsigned int s_facet_hit[16*16];
unsigned int tid = threadIdx.x + threadIdx.y * 16;
if (tid==0) {
int2 number = g_blocklist[blockIdx.x + blocks_offset];
s_block_x = number.x;
s_block_y = number.y;
}
// s_facet_hit[tid] = 0; // normally we do not hit a facet
s_facet_hit[tid] = tid; // For debugging purposes
__syncthreads();
unsigned int reflection_plane;
bool found_reflection = false;
// CODE TO FIND REFLECTIONS
// SKIPPED IT HERE
if (found_reflection) {
s_facet_hit[tid] = reflection_plane + 1;
}
__syncthreads();
// Write thread indices of a 3x3 block around the current thread (on the right some extra debugging output)
if ((s_block_x==-2) && (s_block_y==-44) && (tid==136)) {
unsigned int index = atomicInc(d_debug_offset, gridDim.x*14*14);
d_debug[index] = make_float4((float) tid-17, \
(float) tid-16, \
(float) tid-15, \
(float) s_facet_hit[tid+16]);
unsigned int index_1 = atomicInc(d_debug_offset, gridDim.x*14*14);
d_debug[index_1] = make_float4((float) tid-1, \
(float) tid, \
(float) tid+1, \
(float) s_facet_hit[tid+16]);
unsigned int index_2 = atomicInc(d_debug_offset, gridDim.x*14*14);
d_debug[index_2] = make_float4((float) tid+15, \
(float) tid+16, \
(float) tid+17, \
(float) s_facet_hit[tid+16]);
// Write the values in shared memory of a 3x3 block around the current thread (on the right some extra debugging output)
if ((s_block_x==-2) && (s_block_y==-44) && (tid==136)) {
unsigned int index = atomicInc(d_debug_offset, gridDim.x*14*14);
d_debug[index] = make_float4((float) s_facet_hit[tid-17], \
(float) s_facet_hit[tid-16], \
(float) s_facet_hit[tid-15], \
(float) s_facet_hit[tid+16]);
unsigned int index_1 = atomicInc(d_debug_offset, gridDim.x*14*14);
d_debug[index_1] = make_float4((float) s_facet_hit[tid-1], \
(float) s_facet_hit[tid], \
(float) s_facet_hit[tid+1], \
(float) s_facet_hit[tid+32]);
unsigned int index_2 = atomicInc(d_debug_offset, gridDim.x*14*14);
d_debug[index_2] = make_float4((float) s_facet_hit[tid+15], \
(float) s_facet_hit[tid+16], \
(float) s_facet_hit[tid+17], \
(float) s_facet_hit[tid+33]);
}
}
}
Blocksize is 16x16, so tid has the value:
0 1 ... 14 15
16 .. 31
.
.
.
240 .... 255
Output is
tid==136
135.0000 136.0000 137.0000 4.0000
151.0000 152.0000 153.0000 4.0000
167.0000 168.0000 169.0000 4.0000
135.0000 136.0000 137.0000 4.0000
151.0000 4.0000 153.0000 4.0000
167.0000 4.0000 4.0000 4.0000
tid == 152
119.0000 120.0000 121.0000 152.0000
135.0000 136.0000 137.0000 152.0000
151.0000 152.0000 153.0000 152.0000
119.0000 120.0000 121.0000 152.0000
135.0000 136.0000 137.0000 168.0000
151.0000 152.0000 153.0000 169.0000
As you can see, in the case for tid==152, the value at index 152 remains unchanged (152), whereas for tid==136, the value is overwritten with 4 (reflection_plane==3)
Given the __syncthreads(); just before the code where debugging-output is extracted, this seems to me like a CUDA bug.
On XP64:
Driver 185.85
nvcc --version
nvcc: NVIDIA ® Cuda compiler driver
Copyright © 2005-2009 NVIDIA Corporation
Built on Sat_May__2_05:54:12_PDT_2009
Cuda compilation tools, release 2.2, V0.2.1221
Has the bug. Upgrading XP64 to the latest toolkit & driver fixes the problem. Linux64 with the latest toolkit & driver is also correctly functioning. Was there a bug with __syncthreads() fixed between 2.2 and 2.3???
P.S. I cannot change the topic-name unfortunately