Problems with simple histogram

Hi everyone, i’m new to cuda and i’m experiencing some strange behaviour with a simple histogram implementation.

I doublechecked everything but it seems that i missed some bugs.

The kernels look like follows:

[codebox]

// histogram defines

#define NUM_ELEM 10000

#define NUM_BINS 64

#define NUM_THREADS 112

#define NUM_ELEM_BLOCK (NUM_THREADS * 64)

global void

histKernel(int* data, int* hists)

{

const int baseData = blockIdx.x * NUM_ELEM_BLOCK;

const int blockEl = min(NUM_ELEM - baseData, NUM_ELEM_BLOCK);

// shared histogram stored in shared memory

__shared__ unsigned short sharedHist[NUM_THREADS][NUM_BINS];

// clear memory for thread

for (int bin = 0; bin < NUM_BINS; bin++) {

		sharedHist[threadIdx.x][bin] = 0;

}

// compute histogram per thread

for (int pos = threadIdx.x; pos < blockEl; pos += NUM_THREADS) {

	sharedHist[threadIdx.x][data[baseData+pos]]++;

}

// wait for all other threads

__syncthreads();

// aggregate results per block

const int baseHists = blockIdx.x * NUM_BINS;

if (threadIdx.x < NUM_BINS) {

	for (int i = 0; i < NUM_THREADS; i++) {

		hists[baseHists+threadIdx.x] += sharedHist[i][threadIdx.x];

	}

}

}

global void

mergeKernel(int* hists, int* hist)

{

// aggregate final result

for (int pos = threadIdx.x; pos < NUM_ELEM; pos += NUM_BINS) {

	hist[threadIdx.x] += hists[pos];

}

}

[/codebox]

The code gets invokated as follows:

[codebox]

    // allocate CPU memory

int* dataH = (int*) malloc( sizeof(int) * NUM_ELEM );

int* histH = (int*) malloc( sizeof(int) * NUM_BINS );

// generate NUM_ELEM random integers between 0 and NUM_BINS-1

srand(2009);

for (int i = 0; i < NUM_ELEM; ++i) {

	dataH[i] = rand() % NUM_BINS;

	// printf("%d\n", dataH[i]);

}

// compute number of required blocks

const int NUM_BLOCKS = (NUM_ELEM % NUM_ELEM_BLOCK == 0) ? 

	NUM_ELEM / NUM_ELEM_BLOCK : NUM_ELEM / NUM_ELEM_BLOCK + 1;

// allocate GPU memory

int *dataD, *histsD, *histD;

cudaMalloc( (void**) &dataD, sizeof(int) * NUM_ELEM );

cudaMalloc( (void**) &histsD, sizeof(int) * NUM_BINS * NUM_BLOCKS );

cudaMalloc( (void**) &histD, sizeof(int) * NUM_BINS );

cudaMemcpy( dataD, dataH, sizeof(int) * NUM_ELEM, cudaMemcpyHostToDevice );

cudaMemset( histsD, 0, sizeof(int) * NUM_BINS * NUM_BLOCKS );

cudaMemset( histD, 0, sizeof(int) * NUM_BINS );



// debug

printf("Calling histogram kernel using..\n");

printf("%d blocks with %d threads\n\n", NUM_BLOCKS, NUM_THREADS);

// call histogram kernel

histKernel<<<NUM_BLOCKS, NUM_THREADS>>>(dataD, histsD);

cudaThreadSynchronize();



// call merge kernel

mergeKernel<<<1, NUM_BINS>>>(histsD, histD);

cudaThreadSynchronize();

// copy final histogram to CPU memory

cudaMemcpy(histH, histD, sizeof(int) * NUM_BINS, cudaMemcpyDeviceToHost);



// display histogram

int sum = 0;

printf("Histogram data: \n");

for (int i = 0; i < NUM_BINS; i++) {

	printf("Bin %d: %d\n", i, histH[i]);

	sum += histH[i];

}

printf("Number of elements: %d", sum);

// free CPU and GPU memory

delete [] dataH;

delete [] histH;

cudaFree(dataD);

cudaFree(histsD);

cudaFree(histD);

[/codebox]

I’m experiencing the problem that for small NUM_ELEM ( < 100000 ) the sum is exactly twice NUM_ELEM, the counts look ok.

If i pick 1 million elements the results are totally wrong.

Does anyone see the problem? Thanks in advance!

I just found the very stupid bug.

In the mergeKernel the loop stopping condition should be (pos < NUM_BLOCKS*NUM_BINS) instead of (pos < NUM_ELEM)

thank you anyway =)