I wrote a little CUDA program to test the computing precision with CPU computing result. giving them the same input data, but when the input size is 130560 or multiple of that number, the result is very strange. next is my code:
mytest.cu
// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <cutil.h>
// includes, kernels
#include <mytest_kernel.cu>
int iDivUp(int a, int b){
return ((a % b) != 0) ? (a / b + 1) : (a / b);
}
////////////////////////////////////////////////////////////////////////////////
// declaration, forward
void runTest( int argc, char** argv);
extern "C"
void computeGold( unsigned int *h_Data, unsigned int *h_Result, int dataN);
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int
main( int argc, char** argv)
{
runTest( argc, argv);
CUT_EXIT(argc, argv);
}
////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
////////////////////////////////////////////////////////////////////////////////
void
runTest( int argc, char** argv)
{
CUT_DEVICE_INIT();
int i;
const int dataN = 130560;
const int dataSize = dataN*sizeof(unsigned int);
const int BIN = 256;
const int BIN_Size = BIN*sizeof(unsigned int);
unsigned int *h_Data = (unsigned int*)malloc(dataSize);
unsigned int *h_Result = (unsigned int*)malloc(BIN_Size);
memset(h_Result,0,BIN*sizeof(unsigned int));
srand(2007);
for(i=0;i<dataN;i++)
h_Data[i] = rand() % 256;
for(i=0;i<dataN;i++)
{
unsigned int data1;
data1 = h_Data[i]&0xff;
h_Result[data1]++;
}
for(i=1;i<BIN;i++)
h_Result[i] += h_Result[i-1];
unsigned int *d_Result,*d_Data;
CUDA_SAFE_CALL( cudaMalloc((void **)&d_Data, dataSize ) );
CUDA_SAFE_CALL( cudaMalloc((void **)&d_Result, BIN_Size ) );
CUDA_SAFE_CALL( cudaMemcpy(d_Data, h_Data, dataSize, cudaMemcpyHostToDevice) );
CUDA_SAFE_CALL( cudaMemcpy(d_Result, h_Result, BIN_Size, cudaMemcpyHostToDevice) );
int blocks = iDivUp(dataN,256);
testKernel<<<blocks,256>>>(d_Data,d_Result,dataN);
CUT_CHECK_ERROR("testKernel execution failed.\n");
unsigned int *h_Data_d = (unsigned int*)malloc(dataSize);
CUDA_SAFE_CALL( cudaMemcpy(h_Data_d, d_Data, dataSize, cudaMemcpyDeviceToHost) );
computeGold(h_Data,h_Result,dataN);
int sum = 0;
for(i=0;i<dataN;i++)
{
printf("%d",h_Data[i]-h_Data_d[i]);
sum += h_Data[i]-h_Data_d[i];
}
printf("the different data Number is: %d\n",sum);
}
then the mytest_kernel.cu
#ifndef _MYTEST_KERNEL_H_
#define _MYTEST_KERNEL_H_
__global__ void
testKernel( unsigned int* d_Data, unsigned int* d_Result, int dataN)
{
const int globalTid = blockDim.x*blockIdx.x+threadIdx.x;
const int tid = threadIdx.x;
__shared__ float s_divide[256];
s_divide[tid] = (float)d_Result[tid]/(float)dataN;
__syncthreads();
if(globalTid<dataN)
{
unsigned int data = d_Data[globalTid]&0xff;
d_Data[globalTid] = s_divide[data]*255.0f;
}
__syncthreads();
}
and the mytest_gold.cpp
// export C interface
extern "C"
void computeGold( unsigned int *h_Data, unsigned int *h_Result, int dataN);
void
computeGold( unsigned int *h_Data, unsigned int *h_Result, int dataN)
{
float h_devide[256];
for(int i=0;i<256;i++)
h_devide[i] = (float)h_Result[i]/(float)dataN;
for(int i=0;i<dataN;i++)
{
unsigned int data = h_Data[i]&0xff;
h_Data[i] = h_devide[data]*255.0f;
}
}
at last, I check the output of the two different computing methods, but find that the result have some tiny different.
is that a bug of CUDA? or I have some mistakes?
thanks for any reply.
PuProject_BUG.rar (348 KB)