Single precision floating point value error

The following program has both host and kernel code.
It uses the d_ImaginaryData, d_RealData, d_u, d_I, d_R are the floating point arrays.
when i run this code in the CPU i am getting the correct result.
when i run this in GPU with single thread in entire grid. i could not get the correct result.
all floating point values are single precision.
could any one can tell me the what problem exist in this code or why i am not able get the correct result in GPU.

// Host code

dim3 grid(1,1,1)
dim3 threads(1,1,1)
float *h_Real, *h_Imaginary, *h_u;

int size_Data = 2000 * rbins;
int mem_size_Data = sizeof(float) * size_Data;

// Allocate host memory
h_Real = (float*) malloc(mem_size_Data);
h_Imaginary =  (float*) malloc(mem_size_Data);
    h_u =  (float*) malloc(mem_size_Data);
    // data will be stored from the file into the h_real, h_Imaginary and h_u variable


   // allocate and copy data to the device memory
cutilSafeCall(cudaMalloc((void**) &d_u, mem_size_Image));
cutilSafeCall(cudaMemcpy(d_u, h_u, size_u, cudaMemcpyHostToDevice) );
cutilSafeCall(cudaMalloc((void**) &d_Real, mem_size_Data));
cutilSafeCall(cudaMalloc((void**) &d_Imaginary, mem_size_Data));
cutilSafeCall(cudaMemcpy(d_Real, h_Real, mem_size_Data, cudaMemcpyHostToDevice) );
cutilSafeCall(cudaMemcpy(d_Imaginary, h_Imaginary, mem_size_Data, cudaMemcpyHostToDevice) );

cutilSafeCall(cudaMalloc((void**) &d_R, mem_size_Image));
cutilSafeCall(cudaMalloc((void**) &d_I, mem_size_Image));

cutilSafeCall(cudaMemcpy(d_R, h_R, mem_size_Image, cudaMemcpyHostToDevice) );
cutilSafeCall(cudaMemcpy(d_I, h_I, mem_size_Image, cudaMemcpyHostToDevice) );
Kernel_Function<<< grid, threads>>>(rbins, X, Y, DX, DY, dt, d_PC_Real,d_PC_Imaginary, d_u, tm[1], d_I, d_R );

// GPU code
global void Kernel_Function(int rbins, float X, float Y, float DX, float DY, float dt,
float *d_RealData, float *d_ImaginaryData,
float *d_u, float tm,
float *d_I, float *d_R)

    int n_delay, i, j, k;
    float delay, Range, c;	
int tid, index;
    c = 3e8;    
 for(k=0; k < 2000; k++)

{

	for(j=0; j< 256; j++)

	{

		for( i= 0; i< 256; i++)

		{
			

			Range= sqrt( (X + i*DX)* (X + i*DX) + (Y - j*DY - d_u[k]) * (Y - j*DY - d_u[k]) );
			delay=2*(Range)/c;
			n_delay=floor((delay - tm)/dt)+1;

			
			index =  k * rbins + n_delay; 
			tid = j * 256  + i;

			if (n_delay>=1 && n_delay<= rbins)

			{

				d_I[tid] = d_I[tid] + d_ImaginaryData[index];

				d_R[tid] = d_R[tid] + d_RealData[index];

			}

		}

	}

}

}