Hi everyone, i’m new to cuda and i’m experiencing some strange behaviour with a simple histogram implementation.
I doublechecked everything but it seems that i missed some bugs.
The kernels look like follows:
[codebox]
// histogram defines
#define NUM_ELEM 10000
#define NUM_BINS 64
#define NUM_THREADS 112
#define NUM_ELEM_BLOCK (NUM_THREADS * 64)
global void
histKernel(int* data, int* hists)
{
const int baseData = blockIdx.x * NUM_ELEM_BLOCK;
const int blockEl = min(NUM_ELEM - baseData, NUM_ELEM_BLOCK);
// shared histogram stored in shared memory
__shared__ unsigned short sharedHist[NUM_THREADS][NUM_BINS];
// clear memory for thread
for (int bin = 0; bin < NUM_BINS; bin++) {
sharedHist[threadIdx.x][bin] = 0;
}
// compute histogram per thread
for (int pos = threadIdx.x; pos < blockEl; pos += NUM_THREADS) {
sharedHist[threadIdx.x][data[baseData+pos]]++;
}
// wait for all other threads
__syncthreads();
// aggregate results per block
const int baseHists = blockIdx.x * NUM_BINS;
if (threadIdx.x < NUM_BINS) {
for (int i = 0; i < NUM_THREADS; i++) {
hists[baseHists+threadIdx.x] += sharedHist[i][threadIdx.x];
}
}
}
global void
mergeKernel(int* hists, int* hist)
{
// aggregate final result
for (int pos = threadIdx.x; pos < NUM_ELEM; pos += NUM_BINS) {
hist[threadIdx.x] += hists[pos];
}
}
[/codebox]
The code gets invokated as follows:
[codebox]
// allocate CPU memory
int* dataH = (int*) malloc( sizeof(int) * NUM_ELEM );
int* histH = (int*) malloc( sizeof(int) * NUM_BINS );
// generate NUM_ELEM random integers between 0 and NUM_BINS-1
srand(2009);
for (int i = 0; i < NUM_ELEM; ++i) {
dataH[i] = rand() % NUM_BINS;
// printf("%d\n", dataH[i]);
}
// compute number of required blocks
const int NUM_BLOCKS = (NUM_ELEM % NUM_ELEM_BLOCK == 0) ?
NUM_ELEM / NUM_ELEM_BLOCK : NUM_ELEM / NUM_ELEM_BLOCK + 1;
// allocate GPU memory
int *dataD, *histsD, *histD;
cudaMalloc( (void**) &dataD, sizeof(int) * NUM_ELEM );
cudaMalloc( (void**) &histsD, sizeof(int) * NUM_BINS * NUM_BLOCKS );
cudaMalloc( (void**) &histD, sizeof(int) * NUM_BINS );
cudaMemcpy( dataD, dataH, sizeof(int) * NUM_ELEM, cudaMemcpyHostToDevice );
cudaMemset( histsD, 0, sizeof(int) * NUM_BINS * NUM_BLOCKS );
cudaMemset( histD, 0, sizeof(int) * NUM_BINS );
// debug
printf("Calling histogram kernel using..\n");
printf("%d blocks with %d threads\n\n", NUM_BLOCKS, NUM_THREADS);
// call histogram kernel
histKernel<<<NUM_BLOCKS, NUM_THREADS>>>(dataD, histsD);
cudaThreadSynchronize();
// call merge kernel
mergeKernel<<<1, NUM_BINS>>>(histsD, histD);
cudaThreadSynchronize();
// copy final histogram to CPU memory
cudaMemcpy(histH, histD, sizeof(int) * NUM_BINS, cudaMemcpyDeviceToHost);
// display histogram
int sum = 0;
printf("Histogram data: \n");
for (int i = 0; i < NUM_BINS; i++) {
printf("Bin %d: %d\n", i, histH[i]);
sum += histH[i];
}
printf("Number of elements: %d", sum);
// free CPU and GPU memory
delete [] dataH;
delete [] histH;
cudaFree(dataD);
cudaFree(histsD);
cudaFree(histD);
[/codebox]
I’m experiencing the problem that for small NUM_ELEM ( < 100000 ) the sum is exactly twice NUM_ELEM, the counts look ok.
If i pick 1 million elements the results are totally wrong.
Does anyone see the problem? Thanks in advance!