The bug happen when i try my code that works with CUDA 2.0 but crashed when i upgrade CUDA to 2.1. I try to reproduce the error, but it is still lengthy code, I try to make it sorter then the bug disappeared, sorry about that. I think the bug is related to CUDA 2.1 compiler issue
The problem is the two functions createIndex_kernel are supposed to produce the same result with d_cell, as they are only different with the g_dmass (commented part) that is nothing related to the computation of d_cell. However
the result are different : the first functions give the wrong maximum value 128 is the initial value of d_cell, while the second give the right answer.
Maximum index with first function 128 Allowed value 64
Maximum index with second function 64 Allowed value 64
My system is Ubuntu 8.04 hardy
Kernel Linux 2.6.24-23-generic
Memory 5.8G
Opteron Processor 275 AMD
Quadro FX 5600
NVIDIA-Driver 180.22
CUDA 2.1 final
void runTest( int argc, char** argv);
void test(uint w, uint h, int l);
int
main( int argc, char** argv)
{
runTest(argc, argv);
CUT_EXIT(argc, argv);
}
void
runTest( int argc, char** argv)
{
int w = 4;
int h = 4;
int l = 4;
CUT_DEVICE_INIT(argc, argv);
cutGetCmdLineArgumenti( argc, (const char**) argv, "w", &w);
cutGetCmdLineArgumenti( argc, (const char**) argv, "h", &h);
cutGetCmdLineArgumenti( argc, (const char**) argv, "l", &l);
test(w, h, l);
}
__host__ __device__ int isInside(const int& x,const int& y,const int& z,
const int& w,const int& h,const int& l){
return (x >= 0) && (x < w) && (y >= 0) && (y < h) && (z >= 0) && (z < l);
}
__global__ void createIndex_kernel(uint4 * g_cell, float4* g_dmass,
float* g_data,
float* vx, float* vy, float* vz,
int w, int h, int l
){
const uint nElems = w * h * l;
const uint wh = w * h;
int i = blockIdx.x * blockDim.x + threadIdx.x;
int j = blockIdx.y * blockDim.y + threadIdx.y;
if (i < w && j < h){
uint id = i + j * w;
for (int k=0; k < l; ++k, id+=wh) {
float mass = g_data[id];
float x = i + vx[id];
float y = j + vy[id];
float z = k + vz[id];
int xInt = int(x);
int yInt = int(y);
int zInt = int(z);
float dx = 1.f - (x - xInt);
float dy = 1.f - (y - yInt);
float dz = 1.f - (z - zInt);
int node_id = xInt + yInt * w + zInt * w * h;
float4 dist;
dist = make_float4(mass * dx * dy * dz,
mass * (1.f-dx) * dy * dz,
mass * dx * (1.f - dy) * dz,
mass * (1.f -dx) * (1.f - dy) * dz);
uint4 cell;
int cond;
cond = isInside(xInt, yInt, zInt, w, h, l);
cell.x = node_id * cond + nElems * (1 - cond);
cond = isInside(xInt + 1, yInt, zInt, w, h, l);
cell.y = (node_id + 1)* cond + nElems * (1 - cond);
cond = isInside(xInt, yInt+1, zInt, w, h, l);
cell.z = (node_id + w) * cond + nElems * (1 - cond);
cond = isInside(xInt+1, yInt+1, zInt, w, h, l);
cell.w = (node_id + w + 1) * cond;
g_cell[id] = cell;
g_dmass[id]= dist;
node_id += w*h;
dist = make_float4(mass * dx * dy * (1-dz),
mass * (1.f-dx) * dy * (1-dz),
mass * dx * (1.f - dy) * (1-dz),
mass * (1.f -dx) * (1.f - dy) * (1-dz));
cond = isInside(xInt, yInt, zInt+1, w, h, l);
cell.x = node_id * cond + nElems * (1 - cond);
cond = isInside(xInt + 1, yInt, zInt+1, w, h, l);
cell.y = (node_id + 1)* cond + nElems * (1 - cond);
cond = isInside(xInt, yInt+1, zInt+1, w, h, l);
cell.z = (node_id + w) * cond + nElems * (1 - cond);
cond = isInside(xInt+1, yInt+1, zInt+1, w, h, l);
cell.w = (node_id + w + 1) * cond;
g_cell[id + nElems] = cell;
g_dmass[id + nElems]= dist;
}
}
}
__global__ void createIndex_kernel2(uint4 * g_cell,
float* vx, float* vy, float* vz,
int w, int h, int l
){
const uint nElems = w * h * l;
const uint wh = w * h;
int i = blockIdx.x * blockDim.x + threadIdx.x;
int j = blockIdx.y * blockDim.y + threadIdx.y;
if (i < w && j < h){
uint id = i + j * w;
for (int k=0; k < l; ++k, id+=wh) {
// float mass = g_data[id];
float x = i + vx[id];
float y = j + vy[id];
float z = k + vz[id];
int xInt = int(x);
int yInt = int(y);
int zInt = int(z);
float dx = 1.f - (x - xInt);
float dy = 1.f - (y - yInt);
float dz = 1.f - (z - zInt);
int node_id = xInt + yInt * w + zInt * w * h;
float4 dist;
// dist = make_float4(mass * dx * dy * dz,
// mass * (1.f-dx) * dy * dz,
// mass * dx * (1.f - dy) * dz,
// mass * (1.f -dx) * (1.f - dy) * dz);
uint4 cell;
int cond;
cond = isInside(xInt, yInt, zInt, w, h, l);
cell.x = node_id * cond + nElems * (1 - cond);
cond = isInside(xInt + 1, yInt, zInt, w, h, l);
cell.y = (node_id + 1)* cond + nElems * (1 - cond);
cond = isInside(xInt, yInt+1, zInt, w, h, l);
cell.z = (node_id + w) * cond + nElems * (1 - cond);
cond = isInside(xInt+1, yInt+1, zInt, w, h, l);
cell.w = (node_id + w + 1) * cond;
g_cell[id] = cell;
// g_dmass[id]= dist;
node_id += w*h;
// dist = make_float4(mass * dx * dy * (1-dz),
// mass * (1.f-dx) * dy * (1-dz),
// mass * dx * (1.f - dy) * (1-dz),
// mass * (1.f -dx) * (1.f - dy) * (1-dz));
cond = isInside(xInt, yInt, zInt+1, w, h, l);
cell.x = node_id * cond + nElems * (1 - cond);
cond = isInside(xInt + 1, yInt, zInt+1, w, h, l);
cell.y = (node_id + 1)* cond + nElems * (1 - cond);
cond = isInside(xInt, yInt+1, zInt+1, w, h, l);
cell.z = (node_id + w) * cond + nElems * (1 - cond);
cond = isInside(xInt+1, yInt+1, zInt+1, w, h, l);
cell.w = (node_id + w + 1) * cond;
g_cell[id + nElems] = cell;
// g_dmass[id + nElems]= dist;
}
}
}
template<class T>
__global__ void cudaSetMem_kernel(T* g_data, T c, int len){
unsigned int id = threadIdx.x + blockDim.x * blockIdx.x + blockIdx.y * blockDim.x * gridDim.x;
if (id < len)
g_data[id] = c;
}
template<class T>
void cudaSetMem(T* d_data, T c, int len) {
dim3 threads(256);
dim3 grids((len + 255)>>8);
cudaSetMem_kernel<T><<<grids, threads>>>(d_data, c, len);
}
void test(uint w, uint h, int l){
float *h_iImg, *h_oImg, *h_vx, *h_vy, *h_vz;
int size = w * h * l;
// Generate test on host
h_iImg = new float ;
h_vx = new float ;
h_vy = new float ;
h_vz = new float ;
for (int i=0; i<size; ++i)
h_iImg[i] = rand() % 256;
for (int i=0; i< size; ++i)
h_vx[i] = (float(rand()) / RAND_MAX - 0.5f) * 2.f;
for (int i=0; i< size; ++i)
h_vy[i] = (float(rand()) / RAND_MAX - 0.5f) * 2.f;
for (int i=0; i< size; ++i)
h_vz[i] = (float(rand()) / RAND_MAX - 0.5f) * 2.f;
// Copy to device
float *d_iImg, *d_vx, *d_vy, *d_vz;
cudaMalloc((void**)&d_iImg, size * sizeof(float));
cudaMalloc((void**)&d_vx, size * sizeof(float));
cudaMalloc((void**)&d_vy, size * sizeof(float));
cudaMalloc((void**)&d_vz, size * sizeof(float));
cudaMemcpy(d_iImg, h_iImg, sizeof(float) * size, cudaMemcpyHostToDevice);
cudaMemcpy(d_vx, h_vx, sizeof(float) * size, cudaMemcpyHostToDevice);
cudaMemcpy(d_vy, h_vy, sizeof(float) * size, cudaMemcpyHostToDevice);
cudaMemcpy(d_vz, h_vz, sizeof(float) * size, cudaMemcpyHostToDevice);
uint4* d_cell;
float4* d_mass;
cudaMalloc((void**)&d_cell, size * sizeof(uint4) *2);
cudaMalloc((void**)&d_mass, size * sizeof(float4) *2);
cudaSetMem(d_cell, make_uint4(2*size, 2*size, 2*size, 2*size), 2 * size);
dim3 threads(16,16);
dim3 grids(w / 16 + 1, h/16 + 1);
createIndex_kernel<<<grids, threads>>>(d_cell, d_mass, d_iImg, d_vx, d_vy, d_vz, w, h, l);
uint* h_cell = new uint [4 * 2 * size];
cudaMemcpy(h_cell, d_cell, sizeof(uint4) * 2 * size, cudaMemcpyDeviceToHost);
int maxV=0;
for (int i=0; i< 8 * size; ++i)
maxV = max(h_cell[i], maxV);
fprintf(stderr, "Maximum index with first function %d Allowed value %d \n", maxV, size);
createIndex_kernel2<<<grids, threads>>>(d_cell, d_vx, d_vy, d_vz, w, h, l);
cudaMemcpy(h_cell, d_cell, sizeof(uint4) * 2 * size, cudaMemcpyDeviceToHost);
maxV = 0;
for (int i=0; i< 8 * size; ++i)
maxV = max(h_cell[i], maxV);
fprintf(stderr, "Maximum index with second function %d Allowed value %d\n", maxV, size);
}