Sequential loop inside kernel seems to be not working at all, or this is just a synchronization problem ?!

Hi. I’ve got a problem with the innermost sequential loop executed inside incKernel function. It’s intended to count a sum for each thread and print the output, but it seems to be not working. The only result I have from the code execution is that the value of sum_th[threadIdx.x] is not changed and is equal to zero for each thread. Can you point me at what is the problem with this loop, or this is just the lack of synchronizations. Thanks a lot. Waiting for your reply.

__device__ unsigned int sum_th[10] = { 0 };

__global__ void incKernel()
{
	__shared__ unsigned int counts[10];

	for (int i = 0; i < 10; i++)
		atomicAdd(&sum_th[threadIdx.x], counts[i]);

	__syncthreads();
	atomicCAS(&counts[threadIdx.x], 0, 1);

	printf("tid = %d count = %d\n", threadIdx.x, sum_th[threadIdx.x]);
}

int main()
{
	cudaError_t cudaStatus;
	cudaStatus = cudaSetDevice(0);
	if (cudaStatus != cudaSuccess) {
		fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
		_getch();
		return 1;
	}

	incKernel << <1, 10 >> >();

	cudaStatus = cudaDeviceSynchronize();
	if (cudaStatus != cudaSuccess) {
		fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
		_getch();
		return 1;
	}

    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!");
		_getch();
		return 1;
    }
	
	_getch();

    return 0;
}

You haven’t initialized your shared memory (counts) anywhere.

Your code doesn’t make sense to me.

O’key. Thanks for your reply.