I have a small kernel to compute two float parameters array. The result is same as CPU’s in Emulation,but all wrong when real GPU computing.
Should I normalize the input float array or add double type mid-result in computing? Who can help me? Thanks a lot!
below is the kernel code and host code.
[codebox]host code:
int mem_size = sizeof(Ray)*width*height;
float3* ray_idata;
CUDA_SAFE_CALL( cudaMalloc( (void**) &ray_idata, mem_size));
CUDA_SAFE_CALL( cudaMemcpy( ray_idata, RayArray, mem_size,cudaMemcpyHostToDevice) );
float4 *rfloat_odata;
mem_size = sizeof(float4)*width*height;
CUDA_SAFE_CALL( cudaMalloc( (void**) &rfloat_odata, mem_size));
CUDA_SAFE_CALL( cudaMemcpy( rfloat_odata, RetF, mem_size,cudaMemcpyHostToDevice) );
int3 *rint_odata;
mem_size = sizeof(int3)*width*height;
CUDA_SAFE_CALL( cudaMalloc( (void**) &rint_odata, mem_size));
CUDA_SAFE_CALL( cudaMemcpy( rint_odata, RetI, mem_size,cudaMemcpyHostToDevice) );
// setup execution parameters
dim3 grid( 16, 8, 1);
dim3 threads( 16,32, 1);
// execute the kernel
float3 bbmin1,bbmax1;
bbmin1.x=bbmin.x; bbmin1.y=bbmin.y; bbmin1.z=bbmin.z;
bbmax1.x=bbmax.x; bbmax1.y=bbmax.y; bbmax1.z=bbmax.z;
RayBBoxIkernel<<< grid, threads >>>( ray_idata,bbmin1.x,bbmin1.y,bbmin1.z,bbmax1.x,bbmax1.y,bbmax
1.z,rint_odata,rfloat_odata);
CUT_CHECK_ERROR("Kernel execution failed");
mem_size = sizeof(float4)*width*height;
CUDA_SAFE_CALL( cudaMemcpy( RetFG, rfloat_odata, mem_size,cudaMemcpyDeviceToHost) );
mem_size = sizeof(int3)*width*height;
CUDA_SAFE_CALL( cudaMemcpy( RetIG, rint_odata, mem_size,cudaMemcpyDeviceToHost) );
kernel code:
#define FLT_MAX 1E+37
#define FLT_MIN 1E-37
global void
RayBBoxIkernel( float3* r_idata,float bbminx,float bbminy,float bbminz,float bbmaxx,float bbmaxy,float bbmaxz,int3* rint_odata,float4* rfloat_odata)
{
// read in input data from global memory
// use the bank checker macro to check for bank conflicts during host
// emulation
int offsetCol=threadIdx.x+16*blockIdx.x;
int offsetRow=threadIdx.y+8*blockIdx.y;
int offset=offsetRow*256+offsetCol;
int3 *pRetI=rint_odata+offset; float4 *pRetF=rfloat_odata+offset;
float3 o,d,invd;
float scenemin = FLT_MIN, scenemax = FLT_MAX;
float t0 = scenemin, t1 = scenemax;
float tNear,tFar;
o=*(r_idata+offset*3);
d=*(r_idata+offset*3+1);
invd=*(r_idata+offset*3+2);
//o=r_idata[offset*3];
__syncthreads();
int cont=1;
if(cont &&(d.x!=0.0f)){
tNear = (bbminx - o.x) * invd.x;
tFar = (bbmaxx - o.x) * invd.x;
if (tNear > tFar) {float temp=tFar;tFar=tNear;tNear=temp;}
t0 = tNear > t0 ? tNear : t0;
t1 = tFar < t1 ? tFar : t1;
if (t0 > t1) cont=0;
}
if(cont &&(d.y!=0.0f)){
tNear = (bbminy - o.y) * invd.y;
tFar = (bbmaxy - o.y) * invd.y;
if (tNear > tFar) {float temp=tFar;tFar=tNear;tNear=temp;}
t0 = tNear > t0 ? tNear : t0;
t1 = tFar < t1 ? tFar : t1;
if (t0 > t1) cont=0;
}
if(cont &&(d.z !=0.0f)){
tNear = (bbminz - o.z) * invd.z;
tFar = (bbmaxz - o.z) * invd.z;
if (tNear > tFar) {float temp=tFar;tFar=tNear;tNear=temp;}
t0 = tNear > t0 ? tNear : t0;
t1 = tFar < t1 ? tFar : t1;
if (t0 > t1) cont=0;
}
__syncthreads();
// write data to global memory
rint_odata[offset].x=cont; rfloat_odata[offset].x=t0; rfloat_odata[offset].y=t1;
}
[/codebox]