Why are the results so different? They match the CPU's in emulation, but are all wrong on the real GPU

I have a small kernel that computes two float parameter arrays. The results match the CPU's in emulation mode, but they are all wrong when computed on the real GPU.

Should I normalize the input float array, or use double precision for intermediate results? Can anyone help? Thanks a lot!

Below are the host code and kernel code.

[codebox]// host code

int mem_size = sizeof(Ray)*width*height;

float3* ray_idata;

CUDA_SAFE_CALL( cudaMalloc( (void**) &ray_idata, mem_size));

CUDA_SAFE_CALL( cudaMemcpy( ray_idata, RayArray, mem_size,cudaMemcpyHostToDevice) );
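
// assumed layout: each Ray is three float3s (origin, direction, 1/direction),
// so sizeof(Ray) == 3*sizeof(float3), matching the stride-3 reads in the kernel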

float4 *rfloat_odata;

mem_size = sizeof(float4)*width*height;

CUDA_SAFE_CALL( cudaMalloc( (void**) &rfloat_odata, mem_size));

CUDA_SAFE_CALL( cudaMemcpy( rfloat_odata, RetF, mem_size,cudaMemcpyHostToDevice) );

int3 *rint_odata;

mem_size = sizeof(int3)*width*height;

CUDA_SAFE_CALL( cudaMalloc( (void**) &rint_odata, mem_size));

CUDA_SAFE_CALL( cudaMemcpy( rint_odata, RetI, mem_size,cudaMemcpyHostToDevice) );

// setup execution parameters

dim3  grid( 16, 8, 1);

dim3  threads( 16,32, 1);
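
// 16*16 = 256 threads across and 8*32 = 256 down: one thread per element of a 256x256 grid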

// execute the kernel

	float3 bbmin1,bbmax1;

	bbmin1.x=bbmin.x; bbmin1.y=bbmin.y; bbmin1.z=bbmin.z;

	bbmax1.x=bbmax.x; bbmax1.y=bbmax.y; bbmax1.z=bbmax.z;

	RayBBoxIkernel<<< grid, threads >>>( ray_idata, bbmin1.x, bbmin1.y, bbmin1.z, bbmax1.x, bbmax1.y, bbmax1.z, rint_odata, rfloat_odata);

	CUT_CHECK_ERROR("Kernel execution failed");
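
	// If I remember right, CUT_CHECK_ERROR only does anything in debug builds;
	// an explicit check (sketch) that also catches asynchronous launch failures:
	// cudaThreadSynchronize();
	// cudaError_t err = cudaGetLastError();
	// if (err != cudaSuccess) printf("kernel failed: %s\n", cudaGetErrorString(err));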

	mem_size = sizeof(float4)*width*height;

	CUDA_SAFE_CALL( cudaMemcpy( RetFG, rfloat_odata, mem_size,cudaMemcpyDeviceToHost) );

	mem_size = sizeof(int3)*width*height;

	CUDA_SAFE_CALL( cudaMemcpy( RetIG, rint_odata, mem_size,cudaMemcpyDeviceToHost) );

// kernel code

// NOTE: these shadow the standard <float.h> macros; 1E-37 is a tiny positive
// value, so the ray interval [scenemin, scenemax] starts just above t = 0
#define FLT_MAX 1E+37f

#define FLT_MIN 1E-37f

__global__ void

RayBBoxIkernel( float3* r_idata,float bbminx,float bbminy,float bbminz,float bbmaxx,float bbmaxy,float bbmaxz,int3* rint_odata,float4* rfloat_odata)

{

// compute this thread's element index and read its ray from global memory

		int offsetCol=threadIdx.x+16*blockIdx.x; 

		int offsetRow=threadIdx.y+8*blockIdx.y; 

		int offset=offsetRow*256+offsetCol;

		int3 *pRetI=rint_odata+offset; float4 *pRetF=rfloat_odata+offset;

		float3 o,d,invd;

		float scenemin = FLT_MIN, scenemax = FLT_MAX;

		float t0 = scenemin, t1 = scenemax;

		float tNear,tFar; 

		o=*(r_idata+offset*3);

		d=*(r_idata+offset*3+1);

		invd=*(r_idata+offset*3+2);

		//o=r_idata[offset*3];

		__syncthreads();   // no shared memory is used, so this barrier is probably unnecessary

		int cont=1;

		if(cont &&(d.x!=0.0f)){				

			

			tNear = (bbminx - o.x) * invd.x;

			tFar  = (bbmaxx - o.x) * invd.x;

			if (tNear > tFar) {float temp=tFar;tFar=tNear;tNear=temp;}

			t0 = tNear > t0 ? tNear : t0;

			t1 = tFar  < t1 ? tFar	: t1;

			if (t0 > t1) cont=0;

		}

		if(cont &&(d.y!=0.0f)){

			tNear = (bbminy - o.y) * invd.y;

			tFar  = (bbmaxy - o.y) * invd.y;

			if (tNear > tFar) {float temp=tFar;tFar=tNear;tNear=temp;}

			t0 = tNear > t0 ? tNear : t0;

			t1 = tFar  < t1 ? tFar	: t1;

			if (t0 > t1) cont=0;

		}

		if(cont &&(d.z !=0.0f)){

			tNear = (bbminz - o.z) * invd.z;

			tFar  = (bbmaxz - o.z) * invd.z;

			if (tNear > tFar) {float temp=tFar;tFar=tNear;tNear=temp;}

			t0 = tNear > t0 ? tNear : t0;

			t1 = tFar  < t1 ? tFar	: t1;

			if (t0 > t1) cont=0;

		}

		__syncthreads();

// write results to global memory (only cont, t0 and t1; the remaining
// components keep whatever was copied in from the host)

		pRetI->x = cont; pRetF->x = t0; pRetF->y = t1;

}

[/codebox]
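
One thing I am not sure about: the kernel hard-codes the strides 16, 8 and 256, but the launch uses threads(16,32), so with blockDim.y = 32 the row index threadIdx.y + 8*blockIdx.y never gets past row 87 and neighbouring blocks overlap. Rows 88..255 are then never computed and just keep whatever was copied into the output buffers. I don't know if that alone explains the emulation/GPU difference (both runs pre-fill the outputs from RetF/RetI), but here is how I would derive the indices from the built-in variables instead, a sketch assuming width = height = 256 as in the launch above:

[codebox]// indices derived from blockDim/gridDim, so they always match the launch
int offsetCol = threadIdx.x + blockDim.x * blockIdx.x;              // 0..255
int offsetRow = threadIdx.y + blockDim.y * blockIdx.y;              // 0..255
int offset    = offsetRow * (blockDim.x * gridDim.x) + offsetCol;   // row-major[/codebox]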

[quote name='Jiao Liangbao' date='Feb 21 2009, 07:56 PM' post='509014'][/quote]

I think it may be caused by floating-point accuracy. But how can I solve it? Do I need to do everything in integer arithmetic instead?
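
For what it's worth, instead of expecting the GPU floats to equal the CPU floats exactly, I now compare them with a tolerance. A minimal sketch, assuming RetF holds the CPU reference and RetFG the results copied back from the GPU (the epsilon is an arbitrary choice):

[codebox]int mismatches = 0;
float eps = 1e-5f;   // tolerance picked for illustration only
for (int i = 0; i < width*height; ++i) {
	if (fabsf(RetF[i].x - RetFG[i].x) > eps || fabsf(RetF[i].y - RetFG[i].y) > eps)
		++mismatches;
}
printf("%d of %d results differ by more than %g\n", mismatches, width*height, eps);[/codebox]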