Wrong output when I run the same code on the GPU with a single thread

The following program has both host and kernel code.
It uses d_ImaginaryData, d_RealData, d_u, d_I, and d_R, which are floating-point arrays.
When I run this code on the CPU I get the correct result.
When I run it on the GPU with a single thread in the entire grid, I do not get the correct result.
All floating-point values are single precision.
Could anyone tell me what problem exists in this code, or why I am not able to get the correct result on the GPU?

// Host code
// Launch configuration: a single thread in the entire grid (debug setup,
// matching the single-threaded CPU reference).
dim3 grid(1, 1, 1);
dim3 threads(1, 1, 1);
float *h_Real, *h_Imaginary, *h_u, *h_R, *h_I;

// 2000 pulses of range-compressed complex data (split real/imaginary arrays).
int size_Data = 2000 * 1000;
int mem_size_Data = sizeof(float) * size_Data;
// 256x256 output image, split into real (R) and imaginary (I) planes.
// BUG FIX: the original mixed `mem_size_image` and `mem_size_Image`;
// C is case-sensitive, so one consistent spelling is used throughout.
int mem_size_Image = 256 * 256 * sizeof(float);

// Allocate host memory.
h_Real      = (float*) malloc(mem_size_Data);
h_Imaginary = (float*) malloc(mem_size_Data);
h_u         = (float*) malloc(mem_size_Data);
h_R         = (float*) malloc(mem_size_Image);
h_I         = (float*) malloc(mem_size_Image);
// (data is read from file into h_Real, h_Imaginary and h_u here)

// The image planes are accumulators, so they must start at zero on
// both the host and the device.
memset(h_R, 0, mem_size_Image);
memset(h_I, 0, mem_size_Image);

// Allocate device memory and copy the inputs across.
// BUG FIX: the original allocated d_u with mem_size_Image (256*256 floats)
// yet copied `size_u` bytes into it — a variable that is never defined.
// The kernel indexes d_u[k] for k < 2000, so d_u is sized and copied to
// match h_u (mem_size_Data covers it; only the first 2000 entries are
// read — NOTE(review): shrink to 2000*sizeof(float) if h_u really holds
// one sample per pulse; confirm against the file-reading code).
cutilSafeCall(cudaMalloc((void**) &d_u, mem_size_Data));
cutilSafeCall(cudaMemcpy(d_u, h_u, mem_size_Data, cudaMemcpyHostToDevice));
cutilSafeCall(cudaMalloc((void**) &d_Real, mem_size_Data));
cutilSafeCall(cudaMalloc((void**) &d_Imaginary, mem_size_Data));
cutilSafeCall(cudaMemcpy(d_Real, h_Real, mem_size_Data, cudaMemcpyHostToDevice));
cutilSafeCall(cudaMemcpy(d_Imaginary, h_Imaginary, mem_size_Data, cudaMemcpyHostToDevice));

cutilSafeCall(cudaMalloc((void**) &d_R, mem_size_Image));
cutilSafeCall(cudaMalloc((void**) &d_I, mem_size_Image));
cutilSafeCall(cudaMemcpy(d_R, h_R, mem_size_Image, cudaMemcpyHostToDevice));
cutilSafeCall(cudaMemcpy(d_I, h_I, mem_size_Image, cudaMemcpyHostToDevice));

// BUG FIX: the original launch passed d_PC_Real / d_PC_Imaginary, pointers
// that were never allocated or filled in this fragment — the data actually
// lives in d_Real / d_Imaginary, allocated above.
Kernel_Function<<<grid, threads>>>(rbins, X, Y, DX, DY, dt,
                                   d_Real, d_Imaginary, d_u, tm[1], d_I, d_R);
// Kernel launches are asynchronous and do not report errors directly:
// check for launch-configuration errors, then for execution faults,
// before reading the results back.
cutilSafeCall(cudaGetLastError());
cutilSafeCall(cudaDeviceSynchronize());

cutilSafeCall(cudaMemcpy(h_R, d_R, mem_size_Image, cudaMemcpyDeviceToHost));
cutilSafeCall(cudaMemcpy(h_I, d_I, mem_size_Image, cudaMemcpyDeviceToHost));

// GPU code
// Backprojection-style kernel, intended for a single-thread launch
// (<<<1,1>>>, matching the CPU reference): for every pulse k and every
// image pixel (i, j) it computes the two-way range delay from the
// antenna position d_u[k] to the pixel, converts the delay to a range-bin
// index, and accumulates the corresponding complex sample into the image.
//
// rbins            : number of range bins per pulse (row length of the data)
// X, Y             : image-origin coordinates
// DX, DY           : pixel spacing in x and y
// dt               : range-bin sampling interval (seconds)
// d_RealData,
// d_ImaginaryData  : [2000 x rbins] range-compressed data (device memory)
// d_u              : per-pulse antenna position, read as d_u[k] for k < 2000
// tm               : time of the first range sample
// d_I, d_R         : [256 x 256] output image planes; caller must zero them
//
// BUG FIX: the original declared the kernel with `global` (not `__global__`)
// and was missing the opening `{` of the function body.
__global__ void Kernel_Function(int rbins, float X, float Y, float DX, float DY, float dt,
                                const float* __restrict__ d_RealData,
                                const float* __restrict__ d_ImaginaryData,
                                const float* __restrict__ d_u, float tm,
                                float *d_I, float *d_R)
{
    const float c = 3.0e8f;  // speed of light [m/s]

    for (int k = 0; k < 2000; k++)
    {
        for (int j = 0; j < 256; j++)
        {
            for (int i = 0; i < 256; i++)
            {
                // Distance from the antenna position u[k] to pixel (i, j).
                float dx = X + i * DX;
                float dy = Y - j * DY - d_u[k];
                // BUG FIX: use single-precision math throughout. The original
                // used sqrt/floor and the double literals 3e8 and 2, silently
                // promoting the expression to double; the resulting rounding
                // differs from the single-precision CPU reference (and on
                // pre-SM13 GPUs double is demoted to float), which shifts
                // n_delay by one bin near bin boundaries — the likely cause
                // of the "wrong result on GPU" symptom.
                float Range   = sqrtf(dx * dx + dy * dy);
                float delay   = 2.0f * Range / c;
                int   n_delay = (int)floorf((delay - tm) / dt) + 1;

                // Only accumulate samples whose bin falls inside the data.
                if (n_delay >= 1 && n_delay <= rbins)
                {
                    int index = k * rbins + n_delay;
                    int tid   = j * 256 + i;
                    d_I[tid] += d_ImaginaryData[index];
                    d_R[tid] += d_RealData[index];
                }
            }
        }
    }
}

Since it is impossible to actually run the code you posted and there is no working CPU reference to compare against (also, please use code boxes in future — the post is really difficult to read as formatted), could you describe what the "correct" result should be, what the GPU version produces, and in what way it is "incorrect"? Otherwise I am not sure how you expect anyone to help you.