I have been trying to do floating-point subtraction using CUDA. The code works fine in emulation mode, but it does not work on the device: the difference comes out as 0, which is really strange. Please help! Here is the code:

[codebox]#include “stdafx.h”

#include <stdio.h>

#include <stdlib.h>

#include “cutil.h”

// Defined elsewhere in the project. Idifference() uses the returned pointer as
// its result buffer and hands it back to the caller.
// NOTE(review): Idifference() copies width*width doubles to and from this
// buffer, so zeros(w) presumably returns a zero-filled w*w array -- confirm
// against its definition.
extern “C” double *zeros(int);

// Defined elsewhere in the project. Idifference() takes its CUDA path when
// this is non-zero and its plain CPU loop when it is zero.
extern int MODE;

/*
 * Element-wise difference of two n-by-n row-major matrices:
 *     d_Idiff[n*y + x] = d_M[n*y + x] - d_S[n*y + x]
 *
 * Launch layout: 2D grid of 2D blocks, one thread per element.
 *     x = blockIdx.x*blockDim.x + threadIdx.x   (column)
 *     y = blockIdx.y*blockDim.y + threadIdx.y   (row)
 * Threads with x >= n or y >= n do nothing, so the grid may overshoot the
 * matrix (round the grid size UP on the host). No shared memory is used.
 *
 * Parameters:
 *     d_M, d_S  device pointers to the inputs (at least n*n doubles each)
 *     d_Idiff   device pointer to the output (at least n*n doubles)
 *     n         matrix dimension, which is also the row stride
 *
 * Precision: double arithmetic in device code needs a GPU of compute
 * capability 1.3 or higher AND a build that targets it (e.g.
 * nvcc -arch=sm_13). When targeting an older architecture nvcc demotes
 * double to float in device code, so the kernel misreads the 8-byte host
 * data; results then differ from emulation mode, which runs on the CPU.
 */
__global__ void d_Idifference(double *d_M, double *d_S, double *d_Idiff, int n)
{
    const int x = blockIdx.x*blockDim.x + threadIdx.x;
    const int y = blockIdx.y*blockDim.y + threadIdx.y;

    if (x < n && y < n) {
        /* Form the flat index in size_t so n*y cannot overflow int, and only
           after the bounds check so it is always a valid offset. */
        const size_t idx = (size_t)n*y + x;
        d_Idiff[idx] = d_M[idx] - d_S[idx];
    }
}

/* Reports a failed CUDA runtime call on stderr; returns true when status is cudaSuccess. */
static bool cudaCallOk(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
        return true;
    fprintf(stderr, "Idifference: %s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

/*
 * Computes Idiff = M - S element-wise and returns the result buffer.
 *
 * Parameters:
 *     M, S    host input matrices, row-major with row stride `width`
 *     height  number of rows to process
 *     width   number of columns; also the row stride n and the dimension
 *             used for the n*n device buffers
 *     timer   accumulator; the elapsed time reported by the cutil timer for
 *             this call is added to *timer
 *
 * Returns the buffer obtained from zeros(width), filled with the difference.
 * When MODE is non-zero the work is done by the d_Idifference kernel;
 * otherwise a plain CPU loop is used.
 *
 * Any CUDA failure is reported on stderr and the remaining GPU steps are
 * skipped, so the returned buffer then still holds whatever zeros() put
 * there. An all-zero result together with an error message therefore means
 * the GPU path did not run (for example: no double-precision support on the
 * device, or an invalid launch configuration).
 */
extern "C" double *Idifference(double* M, double *S, int height, int width, double *timer){
    const int n = width;
    /* Size in size_t so n*n cannot overflow int for large images. */
    const size_t bytes = (size_t)n * (size_t)n * sizeof(double);
    unsigned int timer1 = 0;
    double *Idiff = zeros(width);

    cutCreateTimer(&timer1);
    cutStartTimer(timer1);
    if (MODE){
        double *d_Idiff = NULL, *d_M = NULL, *d_S = NULL;

        /* A zero-sized grid is an invalid launch configuration, so skip the
           GPU work entirely when there is nothing to process. */
        if (height > 0 && width > 0){
            const dim3 dimBlock(16,16);
            /* Round UP so partial tiles at the right/bottom edges are covered;
               the kernel bounds-checks both coordinates against n. grid.x
               spans columns (width) and grid.y spans rows (height), matching
               the kernel's idx = n*y + x. */
            const dim3 dimGrid((width  + dimBlock.x - 1)/dimBlock.x,
                               (height + dimBlock.y - 1)/dimBlock.y);

            bool ok = cudaCallOk(cudaMalloc((void**)&d_Idiff, bytes), "cudaMalloc(d_Idiff)")
                   && cudaCallOk(cudaMalloc((void**)&d_M, bytes), "cudaMalloc(d_M)")
                   && cudaCallOk(cudaMalloc((void**)&d_S, bytes), "cudaMalloc(d_S)")
                   && cudaCallOk(cudaMemcpy(d_S, S, bytes, cudaMemcpyHostToDevice), "copy of S to device")
                   && cudaCallOk(cudaMemcpy(d_M, M, bytes, cudaMemcpyHostToDevice), "copy of M to device")
                   && cudaCallOk(cudaMemcpy(d_Idiff, Idiff, bytes, cudaMemcpyHostToDevice), "copy of Idiff to device");

            if (ok){
                d_Idifference<<<dimGrid,dimBlock>>>(d_M,d_S,d_Idiff,n);
                /* A launch does not return a status: configuration errors
                   show up in cudaGetLastError(), execution errors at the
                   next synchronizing call. */
                ok = cudaCallOk(cudaGetLastError(), "d_Idifference launch")
                  && cudaCallOk(cudaThreadSynchronize(), "d_Idifference execution")
                  && cudaCallOk(cudaMemcpy(Idiff, d_Idiff, bytes, cudaMemcpyDeviceToHost), "copy of Idiff to host");
            }
        }

        /* Debug probe of one fixed element; only read it when it lies inside
           the n*n buffers. */
        if (n > 0){
            const size_t probe = (size_t)51 * n + 134;
            if (probe < (size_t)n * (size_t)n)
                printf("%f %f %f ", S[probe], M[probe], Idiff[probe]);
        }

        cutStopTimer(timer1);
        *timer += cutGetTimerValue(timer1);

        /* cudaFree(NULL) is a no-op, so this is safe after a partial failure. */
        cudaFree(d_M);
        cudaFree(d_S);
        cudaFree(d_Idiff);
    }
    else{
        for (int y = 0; y < height; y++){
            for (int x = 0; x < width; x++)
                Idiff[n*y + x] = M[n*y + x] - S[n*y + x];
        }
        cutStopTimer(timer1);
        *timer += cutGetTimerValue(timer1);
    }

    /* Release the cutil timer created above; it was previously leaked on every call. */
    cutDeleteTimer(timer1);
    return Idiff;
}[/codebox]

This is in fact a portion of a bigger program, so not all of it may make sense to everyone. But my problem is essentially this: when I run it in emulation mode, the printf gives the right result, but when I run it in non-emulation mode, Idiff[…] gives 0.00 regardless of the indices I feed to it. Please help!