The first piece of code:
__global__ void rgb2Gray(uchar * rgbData, int * grayData, int * scaleData, int totalSize)
{
size_t blockSize = blockDim.x * blockDim.y * blockDim.z;
size_t uniBlockInd = blockIdx.x + blockIdx.y * gridDim.x;
size_t threadOffset = threadIdx.x + threadIdx.y * blockDim.x
+ threadIdx.z * blockDim.x * blockDim.y;
size_t uniThreadInd = uniBlockInd * blockSize + threadOffset;
if (uniThreadInd / 3 < totalSize)
{
grayData[uniThreadInd / 3]++;
}
}
The second piece of code:
__global__ void rgb2Gray(uchar * rgbData, int * grayData, int * scaleData, int totalSize)
{
size_t blockSize = blockDim.x * blockDim.y * blockDim.z;
size_t uniBlockInd = blockIdx.x + blockIdx.y * gridDim.x;
size_t threadOffset = threadIdx.x + threadIdx.y * blockDim.x
+ threadIdx.z * blockDim.x * blockDim.y;
size_t uniThreadInd = uniBlockInd * blockSize + threadOffset;
if (uniThreadInd / 3 < totalSize && threadIdx.x == 1) {
grayData[uniThreadInd / 3] ++;
}
if (uniThreadInd / 3 < totalSize && threadIdx.x == 0) {
grayData[uniThreadInd / 3] ++;
}
if (uniThreadInd / 3 < totalSize && threadIdx.x == 2) {
grayData[uniThreadInd / 3] ++;
}
}
The main function is as shown below:
dim3 nThreadPerBlock(3, 15, 15);
dim3 nBlockPerGrid((rgbMat->cols + 14) / 15, (rgbMat->rows + 14) / 15, 1);
rgb2Gray << <nBlockPerGrid, nThreadPerBlock >> >(dev_rgbData, dev_grayData, dev_scaleData, rgbMat->rows * rgbMat->cols);
The result expected will show that all grayData equals 3, but only the second piece of code show me the right answer. Maybe the first code encounters some problem like data race? Can someone explain that? Thanks for your help.