Counting values --- what's wrong? CUDA first-timer's naive code ~ help please!

In the following simple code, counting values on the device gives me strange output. Can anybody tell me why this happens and how to fix it?

==============
CODE:

#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

__global__ void count(int* zd, int* cntd, int N)
{
	int idx = blockIdx.x * blockDim.x + threadIdx.x;
	int k;
	if( idx < N)
	{
		k = zd[idx];
		++cntd[k];
	}
}

int main( int argc, char** argv)
{
int i;
int ntypes = 20;
int N = 1000000;
int blocksize = 512;
int nblocks = N / blocksize + (N % blocksize == 0? 0 : 1);

int* zd;
int* zh;
int* cntd;
int* cnth;

zh = (int *) malloc( N * sizeof(int));
cnth = (int *) malloc( ntypes * sizeof(int));

// prepare data
srand((unsigned)(time(0)));
for(i=0; i< N; i++)
{
	zh[i] = (int) (ntypes * (rand() / (RAND_MAX + 1.0)));
}

// set counter to zero
for(i=0;i<ntypes;i++) cnth[i] = 0;

// **************************  count values in HOST 
for(i=0; i<N; i++) ++cnth[zh[i]];
int totalcnt=0;
for(i=0; i<ntypes; i++) {
	totalcnt +=cnth[i];
	printf("%d, %d, %d\n", i, cnth[i], totalcnt);
}

// **************************  count values in DEVICE 
cudaMalloc((void **) &zd, N * sizeof(int));
cudaMalloc((void **) &cntd, ntypes * sizeof(int));

for(i=0;i<ntypes;i++) cnth[i] = 0;

cudaMemcpy(zd, zh, N * sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(cntd, cnth, ntypes * sizeof(int), cudaMemcpyHostToDevice);
	
count <<< nblocks, blocksize >>> (zd, cntd, N);

cudaThreadSynchronize();
	
cudaMemcpy(cnth, cntd, ntypes * sizeof(int), cudaMemcpyDeviceToHost);

// print
totalcnt=0;
for(i=0; i<ntypes; i++) {
	totalcnt +=cnth[i];
	printf("%d, %d, %d\n", i, cnth[i], totalcnt);
}

free(zh); free(cnth); 
cudaFree(zd); cudaFree(cntd);

return 1;

}

OUTPUT:

0, 50134, 50134
1, 50049, 100183
2, 49539, 149722
3, 50071, 199793
4, 50013, 249806
5, 49470, 299276
6, 49993, 349269
7, 50072, 399341
8, 50206, 449547
9, 49641, 499188
10, 49879, 549067
11, 50068, 599135
12, 50122, 649257
13, 49698, 698955
14, 49921, 748876
15, 50153, 799029
16, 50341, 849370
17, 50334, 899704
18, 50471, 950175
19, 49825, 1000000
0, 89, 89
1, 94, 183
2, 87, 270
3, 89, 359
4, 87, 446
5, 88, 534
6, 91, 625
7, 86, 711
8, 89, 800
9, 88, 888
10, 89, 977
11, 85, 1062
12, 91, 1153
13, 84, 1237
14, 89, 1326
15, 91, 1417
16, 88, 1505
17, 90, 1595
18, 93, 1688
19, 88, 1776
==============

__global__ void count(int* zd, int* cntd, int N) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  int k;
  if( idx < N) {
	k = zd[idx];
	++cntd[k];
  }
}

In your kernel you have a race condition: multiple threads read, increment, and write back the same cntd[k] at essentially the same time, so most of the increments are lost. That is why the device counts come out so much smaller than the host counts.

One option would be to use atomicAdd(), which guarantees that conflicting updates are serialized and no increments are lost. In your application this will be slow, though: with only 20 bins, a million threads end up contending for the same few counters in global memory.
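For reference, the atomic version only needs a one-line change in the kernel (an untested sketch; atomicAdd() on global memory requires a device with compute capability 1.1 or later):

__global__ void count(int* zd, int* cntd, int N)
{
	int idx = blockIdx.x * blockDim.x + threadIdx.x;
	if( idx < N)
	{
		// atomicAdd serializes conflicting updates, so no increments are lost
		atomicAdd(&cntd[zd[idx]], 1);
	}
}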

The better option, since cntd is small (only 20 bins), would be for each block to accumulate a partial histogram in shared memory and write it out to device memory. Then run a second kernel to add the partial histograms together; see the sketch below.
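Roughly like this (an untested sketch, not drop-in code: the names count_partial, sum_partials and the partial buffer are just illustrative, ntypes is hard-coded to 20 to match your program, and the shared-memory atomicAdd() requires compute capability 1.2 or later):

#define NTYPES 20

// Kernel 1: each block builds its own histogram in shared memory,
// then writes it to its own slot in the partial[] array.
__global__ void count_partial(const int* zd, int* partial, int N)
{
	__shared__ int s_hist[NTYPES];

	// zero the shared histogram
	for (int i = threadIdx.x; i < NTYPES; i += blockDim.x)
		s_hist[i] = 0;
	__syncthreads();

	int idx = blockIdx.x * blockDim.x + threadIdx.x;
	if (idx < N)
		atomicAdd(&s_hist[zd[idx]], 1);   // contention is only within one block
	__syncthreads();

	// write this block's partial histogram to global memory
	for (int i = threadIdx.x; i < NTYPES; i += blockDim.x)
		partial[blockIdx.x * NTYPES + i] = s_hist[i];
}

// Kernel 2: one thread per bin sums the per-block partial histograms.
__global__ void sum_partials(const int* partial, int* cntd, int nblocks)
{
	int bin = blockIdx.x * blockDim.x + threadIdx.x;
	if (bin < NTYPES)
	{
		int sum = 0;
		for (int b = 0; b < nblocks; b++)
			sum += partial[b * NTYPES + bin];
		cntd[bin] = sum;
	}
}

On the host you would allocate partial[] with nblocks * ntypes ints, launch count_partial with the same grid you already use, then launch sum_partials with at least ntypes threads (e.g. <<< 1, ntypes >>>) and copy cntd back exactly as before.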

Thank you very much, Jamie! That’s exactly what I wanted to hear. I will explore further. This CUDA stuff is keeping me very engaged. ~~JC.