Hi,

I am new to CUDA programming and am trying to write a very simple GPU implementation of a function. It calculates a value for every thread and passes it back via the global array f_out. I compare the result to the CPU case, then calculate the sum of the f_out values. The sums of the values on the GPU and the CPU differ, and I cannot seem to figure out why. Any help would be appreciated.

```
// Per-point objective kernel: for each source point gid with nearest-neighbor
// index j = indices[gid], computes (T*p1 - p2)' * M * (T*p1 - p2) / num_matches
// and stores it in f_out[gid].
//
// Data layout (established by the indexing below): pt_set1/pt_set2 are SoA,
// x | y | z planes of length N; mat holds 9 per-point coefficients, also in
// N-strided planes; trans is a 3x4 row-major transform (12 doubles).
// Launch: 1-D grid, gridSize*blockSize >= N; tail threads are guarded here.
template <typename T>
__global__ void cuda_f(int N, int num_matches, T* pt_set1, T* pt_set2, int * indices, T* mat, T* trans, T* f_out)
{
    unsigned int gid = blockIdx.x * blockDim.x + threadIdx.x;
    // Bounds-check the thread index BEFORE touching indices[]: the original
    // code read indices[gid] first, an out-of-bounds global read for the
    // padding threads of the last block.
    if (gid >= (unsigned int)N)
        return;
    int j = indices[gid];
    if (j >= 0 && j < N) {
        // res = T * p1 - p2
        T resX = (trans[0] * pt_set1[gid] + trans[1]*pt_set1[gid+N] + trans[2]*pt_set1[gid+2*N] + trans[3]) - pt_set2[j];
        T resY = (trans[4] * pt_set1[gid] + trans[5]*pt_set1[gid+N] + trans[6]*pt_set1[gid+2*N] + trans[7]) - pt_set2[j+N];
        T resZ = (trans[8] * pt_set1[gid] + trans[9]*pt_set1[gid+N] + trans[10]*pt_set1[gid+2*N] + trans[11]) - pt_set2[j+2*N];
        // temp = M * res
        T tempX = (mat[gid] * resX + mat[gid+N] * resY + mat[gid+2*N] * resZ);
        T tempY = (mat[gid+3*N] * resX + mat[gid+4*N] * resY + mat[gid+5*N] * resZ);
        T tempZ = (mat[gid+6*N] * resX + mat[gid+7*N] * resY + mat[gid+8*N] * resZ);
        // temp_double = res' * temp
        T temp_double = resX * tempX + resY * tempY + resZ * tempZ;
        // Divide by (T), not (double): a (double) cast would silently promote
        // the arithmetic when the kernel is instantiated with T = float.
        f_out[gid] = temp_double / (T)num_matches;
    } else {
        // f_out is cudaMalloc'd (uninitialized) on the device. Threads whose
        // neighbor index is invalid must still write a defined value, otherwise
        // the host-side sum accumulates garbage for those slots — this is the
        // most likely cause of a GPU/CPU sum mismatch.
        f_out[gid] = (T)0;
    }
}
// Host wrapper: uploads point sets, per-point M matrices, neighbor indices and
// the transform t, launches cuda_f<double>, and sums the per-point results.
// params is expected to be an optData* (fields used: N, num_matches, p1, p2,
// M, ind) — presumably the optimizer's opaque context; confirm with callers.
int f(std::vector<double> &t, void * params)
{
    optData *opt = (optData *)params;
    // NOTE: original declared "d_ p2" (stray space) — a syntax error.
    double *d_p1, *d_p2, *d_M, *d_trans, *d_f_out;
    int *d_ind;
    // read in p1 points (SoA: x|y|z planes of length N)
    checkCudaErrors(cudaMalloc((void **)&d_p1, 3*opt->N*sizeof(double)));
    checkCudaErrors(cudaMemcpy(d_p1, opt->p1, 3*opt->N*sizeof(double), cudaMemcpyHostToDevice));
    // read in p2 points
    checkCudaErrors(cudaMalloc((void **)&d_p2, 3*opt->N*sizeof(double)));
    checkCudaErrors(cudaMemcpy(d_p2, opt->p2, 3*opt->N*sizeof(double), cudaMemcpyHostToDevice));
    // read in M matrix for each point (9 coefficients per point)
    checkCudaErrors(cudaMalloc((void **)&d_M, 9*opt->N*sizeof(double)));
    checkCudaErrors(cudaMemcpy(d_M, opt->M, 9*opt->N*sizeof(double), cudaMemcpyHostToDevice));
    // read in nearest neighbor index
    checkCudaErrors(cudaMalloc((void **)&d_ind, opt->N*sizeof(int)));
    checkCudaErrors(cudaMemcpy(d_ind, opt->ind, opt->N*sizeof(int), cudaMemcpyHostToDevice));
    // read in transformation (12 doubles: 3x4 row-major)
    checkCudaErrors(cudaMalloc((void **)&d_trans, t.size()*sizeof(double)));
    checkCudaErrors(cudaMemcpy(d_trans, &t[0], t.size()*sizeof(double), cudaMemcpyHostToDevice));
    // allocate output (uninitialized — the kernel must write every slot < N)
    checkCudaErrors(cudaMalloc((void **)&d_f_out, opt->N*sizeof(double)));
    int blockSize = 256;
    // ceil-div so the grid covers all N points even when N is not a multiple
    // of blockSize; the kernel guards the tail threads.
    int gridSize = (opt->N + blockSize - 1) / blockSize;
    cuda_f<double><<<gridSize,blockSize>>>(opt->N, opt->num_matches, d_p1, d_p2, d_ind, d_M, d_trans, d_f_out);
    getLastCudaError("cuda_f kernel failed");
    // summing up the elements; original used "gpu_opt_data->N", an undeclared
    // identifier — the size is opt->N.
    std::vector<double> f_out(opt->N);
    // blocking cudaMemcpy synchronizes with the kernel before reading d_f_out
    checkCudaErrors(cudaMemcpy(&f_out[0], d_f_out, sizeof(double)*opt->N, cudaMemcpyDeviceToHost));
    double f = 0.0;
    for (int i = 0; i < opt->N; i++) {
        f += f_out[i];
    }
    // free device memory — the original leaked every allocation on each call
    checkCudaErrors(cudaFree(d_p1));
    checkCudaErrors(cudaFree(d_p2));
    checkCudaErrors(cudaFree(d_M));
    checkCudaErrors(cudaFree(d_ind));
    checkCudaErrors(cudaFree(d_trans));
    checkCudaErrors(cudaFree(d_f_out));
    // NOTE(review): the original int function had no return statement (UB),
    // and the computed sum `f` was discarded. Presumably the sum should be
    // reported to the caller (e.g. stored in opt) — confirm the intended
    // contract; returning 0 here as a status code.
    (void)f;
    return 0;
}
```