I write a small program to check your problem on GTX480, cuda 3.2, the result is correct.

```
__global__ void testAtomicAdd(int *output)
{
int __shared__ AtomicPosition; // it can be also in global memory, the behavior is the same
int __shared__ HashInt[512];
if(threadIdx.x==0)
AtomicPosition=0;
__syncthreads();
if( threadIdx.x % 2 ){
HashInt[atomicAdd(&AtomicPosition, 1)]=threadIdx.x;
}
__syncthreads();
if ( threadIdx.x < AtomicPosition ){
output[threadIdx.x] = HashInt[ threadIdx.x ];
}
}
#define M 64
int main()
{
int output[M];
int *d_out;
cudaMalloc((void**)&d_out, M * sizeof(int));
for(int i = 0; i < M; i++) {
output[i] = -1;
}
cudaMemcpy(d_out, output, M * sizeof(int), cudaMemcpyHostToDevice);
testAtomicAdd<<<1, M >>>( d_out );
cudaMemcpy( output, d_out, M * sizeof(int), cudaMemcpyDeviceToHost);
for(int i = 0; i < M; i++) {
printf("output[%d] = %d\n", i, output[i] );
}
return 0;
}
```

The result is

```
output[0] = 1
output[1] = 33
output[2] = 35
output[3] = 3
output[4] = 37
output[5] = 5
output[6] = 39
output[7] = 7
output[8] = 41
output[9] = 9
output[10] = 43
output[11] = 11
output[12] = 45
output[13] = 13
output[14] = 47
output[15] = 15
output[16] = 49
output[17] = 17
output[18] = 51
output[19] = 19
output[20] = 53
output[21] = 21
output[22] = 55
output[23] = 23
output[24] = 57
output[25] = 25
output[26] = 59
output[27] = 27
output[28] = 61
output[29] = 29
output[30] = 63
output[31] = 31
output[32] = -1
output[33] = -1
output[34] = -1
output[35] = -1
output[36] = -1
output[37] = -1
output[38] = -1
output[39] = -1
output[40] = -1
output[41] = -1
output[42] = -1
output[43] = -1
output[44] = -1
output[45] = -1
output[46] = -1
output[47] = -1
output[48] = -1
output[49] = -1
output[50] = -1
output[51] = -1
output[52] = -1
output[53] = -1
output[54] = -1
output[55] = -1
output[56] = -1
output[57] = -1
output[58] = -1
output[59] = -1
output[60] = -1
output[61] = -1
output[62] = -1
output[63] = -1
```

Could you describe your garbage value?