Hi,
I have written a small kernel, for learning purposes, that calculates the histogram of an 8-bit image:
#include "histogram_common.h"
#include <cuda.h>
#include <cutil.h>
#define BINCOUNT 256
#define THREADS_PER_BLOCK 128
#define NUM_BLOCKS(x) (((x)/THREADS_PER_BLOCK)+1)
__global__ void calculatehistogram(unsigned char* d_imagedata, unsigned int imagedatasize, unsigned int* d_histogram)
{
unsigned int currentpixel;
currentpixel = blockIdx.x*blockDim.x + threadIdx.x;
if (currentpixel < imagedatasize)
{
d_histogram[d_imagedata[currentpixel]]++;
}
}
extern "C" void gethistogram(unsigned char* h_imagedata, unsigned int imagedatasize, unsigned int* h_histogram)
{
//Devicepointers holding our data
unsigned char* d_imagedata;
unsigned int* d_histogram;
//Allocate memory on device and copy imagedata to device
CUDA_SAFE_CALL(cudaMalloc((void**) &d_imagedata, imagedatasize));
CUDA_SAFE_CALL(cudaMemcpy(d_imagedata, h_imagedata, imagedatasize, cudaMemcpyHostToDevice));
//Hostpointer holding our histogram
CUDA_SAFE_CALL(cudaMalloc((void**) &d_histogram, BINCOUNT * sizeof(unsigned int)));
CUDA_SAFE_CALL(cudaMemset(d_histogram, 0, BINCOUNT * sizeof(unsigned int)));
//Call the kernel
calculatehistogram<<<NUM_BLOCKS(imagedatasize), THREADS_PER_BLOCK>>>(d_imagedata, imagedatasize, d_histogram);
//Copy histogram back to host
CUDA_SAFE_CALL(cudaMemcpy(h_histogram, d_histogram, BINCOUNT * sizeof(unsigned int), cudaMemcpyDeviceToHost));
//Free Memory
CUDA_SAFE_CALL(cudaFree(d_imagedata));
CUDA_SAFE_CALL(cudaFree(d_histogram));
}
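In case it matters, the host code that calls gethistogram and prints the bins looks roughly like this. It is a simplified stand-in for my actual test program: the image size is a placeholder and the buffer is just filled with dummy pixel values instead of being loaded from a real image.

#include <cstdio>
#include <cstdlib>

#define BINCOUNT 256

extern "C" void gethistogram(unsigned char* h_imagedata, unsigned int imagedatasize, unsigned int* h_histogram);

int main()
{
    const unsigned int imagedatasize = 1024 * 768;      // placeholder size, not my real image
    unsigned char* h_imagedata = (unsigned char*) malloc(imagedatasize);
    unsigned int h_histogram[BINCOUNT];

    // Fill the buffer with dummy pixel values instead of loading a real image
    for (unsigned int i = 0; i < imagedatasize; ++i)
        h_imagedata[i] = (unsigned char)(i % BINCOUNT);

    gethistogram(h_imagedata, imagedatasize, h_histogram);

    // Print one line per bin; this loop produces the output shown below
    for (int bin = 0; bin < BINCOUNT; ++bin)
        printf("Pixels with value %d: %u\n", bin, h_histogram[bin]);

    free(h_imagedata);
    return 0;
}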
The kernel works fine in device emulation mode and gives the desired result. But when I run it on the actual device, I get the following output:
...
Pixels with value 26: 1
Pixels with value 27: 1
Pixels with value 28: 1
Pixels with value 29: 1
Pixels with value 30: 1
Pixels with value 31: 1
Pixels with value 32: 1
Pixels with value 33: 1
Pixels with value 34: 1
Pixels with value 35: 1
Pixels with value 36: 1
Pixels with value 37: 1
Pixels with value 38: 1
Pixels with value 39: 1
Pixels with value 40: 1
Pixels with value 41: 1
Pixels with value 42: 1
Pixels with value 43: 1
Pixels with value 44: 1
Pixels with value 45: 1
Pixels with value 46: 1
Pixels with value 47: 1
Pixels with value 48: 1
Pixels with value 49: 1
Pixels with value 50: 1
Pixels with value 51: 1
Pixels with value 52: 1
Pixels with value 53: 1
Pixels with value 54: 1
Pixels with value 55: 1
Pixels with value 56: 1
Pixels with value 57: 1
Pixels with value 58: 1
Pixels with value 59: 1
Pixels with value 60: 1
Pixels with value 61: 1
Pixels with value 62: 1
Pixels with value 63: 1
Pixels with value 64: 1
Pixels with value 65: 1
Pixels with value 66: 1
Pixels with value 67: 1
Pixels with value 68: 1
Pixels with value 69: 1
Pixels with value 70: 1
Pixels with value 71: 1
Pixels with value 72: 1
Pixels with value 73: 1
Pixels with value 74: 1
Pixels with value 75: 1
Pixels with value 76: 1
Pixels with value 77: 1
Pixels with value 78: 1
Pixels with value 79: 1
Pixels with value 80: 1
Pixels with value 81: 1
Pixels with value 82: 1
Pixels with value 83: 1
Pixels with value 84: 1
Pixels with value 85: 1
Pixels with value 86: 1
Pixels with value 87: 1
Pixels with value 88: 1
Pixels with value 89: 1
Pixels with value 90: 1
Pixels with value 91: 1
Pixels with value 92: 1
Pixels with value 93: 1
Pixels with value 94: 1
Pixels with value 95: 1
...
Some bins even come out as 0 or 2.
I have tried to debug this for hours, but I still have no idea what is causing this behaviour.
Thank you in advance for your help,
Kwyjibo
PS: Windows 7 64-bit, CUDA 3.0, driver 197.16, GeForce G210M, compute capability 1.2