Hi! Unfortunately I’m struggling to understand why we can’t use an atomic incremented counter to index an array allocated in global memory. Does anyone here have a good explanation why the following (minimal, no error checks) code fail to fill (store) the array properly although count is correctly set? __syncthreads() does not seems to help. What would be the proper CUDA way to accomplish such a task? TIA.
(note: compiled with -arch=sm_20)
__global__ void kernel(int *count, float *stor){
int tx = threadIdx.x + blockIdx.x*blockDim.x;
int ty = threadIdx.y + blockIdx.y*blockDim.y;
if (tx%2) {
stor[*count] = tx;
atomicAdd(count, 1);
}
if (ty%2) {
stor[*count] = ty;
atomicAdd(count, 1);
}
}
int main(){
int *count;
float *stor;
dim3 numThreads(8,8);
dim3 numBlocks(2,2);
int size = numBlocks.x*numBlocks.y*numThreads.x*numThreads.y;
size_t nbytes = size*sizeof(float);
cudaMallocHost(&count, sizeof(int));
cudaMallocHost(&stor, nbytes);
memset(count, 0, sizeof(int));
kernel<<<numBlocks,numThreads>>>(count, stor);
cudaDeviceSynchronize();
printf("count = %d\n", *count);
for (int i=0; i<size; i++){
printf("%f\t", stor[i]);
}
return 0;
}