Problem with kernel output

I am new to CUDA programming and am trying to write a very simple GPU implementation of a function. It calculates a value for every thread and passes back to the global array f_out. I compare the result to the CPU case. I then calculate the sum of the f_out values. Now the sum of the values in GPU and CPU are different. I cannot seem to figure out the problem. Any help would be appreciated.

template <typename T>
        __global__ void cuda_f(int N, int num_matches, T* pt_set1, T* pt_set2, int * indices, T* mat, T* trans, T* f_out) 
            //unsigned int tid = threadIdx.x;
            unsigned int gid  = blockIdx.x * blockDim.x + threadIdx.x;
            int j = indices[gid];
            if( j >= 0 && j < N && gid < N) {
            // res = T * p1 - p2
            T resX = (trans[0] * pt_set1[gid] + trans[1]*pt_set1[gid+N] + trans[2]*pt_set1[gid+2*N] + trans[3]) - pt_set2[j];
            T resY = (trans[4] * pt_set1[gid] + trans[5]*pt_set1[gid+N] + trans[6]*pt_set1[gid+2*N] + trans[7]) - pt_set2[j+N];
            T resZ = (trans[8] * pt_set1[gid] + trans[9]*pt_set1[gid+N] + trans[10]*pt_set1[gid+2*N] + trans[11]) - pt_set2[j+2*N];
            // temp = M * res
            T tempX = (mat[gid] * resX + mat[gid+N] * resY + mat[gid+2*N] * resZ);
            T tempY = (mat[gid+3*N] * resX + mat[gid+4*N] * resY + mat[gid+5*N] * resZ);
            T tempZ = (mat[gid+6*N] * resX + mat[gid+7*N] * resY + mat[gid+8*N] * resZ);
            // temp_double = res' * temp
            T temp_double = resX * tempX + resY * tempY + resZ * tempZ;

            f_out[gid] = temp_double/(double)num_matches;
 int f(std::vector<double> &t, void * params) 
    optData *opt = (optData *)params;
    double *d_p1, *d_ p2, *d_M,*d_trans, *d_f_out;
    int *d_ind;
    //read in p1 points
    checkCudaErrors(cudaMalloc((void **)&d_p1, 3*opt->N*sizeof(double)));
    checkCudaErrors(cudaMemcpy(d_p1, opt->p1, 3*opt->N* sizeof(double), cudaMemcpyHostToDevice));
    // read in p2 points
    checkCudaErrors(cudaMalloc((void **)&d_p2, 3*opt->N*sizeof(double)));
    checkCudaErrors(cudaMemcpy(d_p2, opt->p2, 3*opt->N* sizeof(double), cudaMemcpyHostToDevice));
    // read in M matrix for each point
    checkCudaErrors(cudaMalloc((void **)&d_M, 9*opt->N*sizeof(double)));
    checkCudaErrors(cudaMemcpy(d_M, opt->M, 9*opt->N* sizeof(double), cudaMemcpyHostToDevice));
    // read in nearest neighbor index
    checkCudaErrors(cudaMalloc((void **)&d_ind, opt->N*sizeof(int)));
    checkCudaErrors(cudaMemcpy(d_ind, opt->ind, opt->N*sizeof(int), cudaMemcpyHostToDevice));
    // read in transformation
    checkCudaErrors(cudaMalloc((void **)&d_trans, t.size()*sizeof(double)));
    checkCudaErrors(cudaMemcpy(d_trans, &t[0], t.size() * sizeof(double), cudaMemcpyHostToDevice));
    // allocate output
    checkCudaErrors(cudaMalloc((void **)&d_f_out, opt->N*sizeof(double)));
    int blockSize = 256;
    int minGridSize;
    int gridSize;
    //cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, (void *)cuda_f, 0, gpu_opt_data->N); 
    gridSize = (opt->N + blockSize - 1) / blockSize;
    // opt->N is not multiple of blockSize
    cuda_f<double><<<gridSize,blockSize>>>(opt->N, opt->num_matches, d_p1,d_p2,d_ind, d_M, d_trans, d_f_out);
    getLastCudaError("cuda_f kernel failed");
    // summing up the elements
    std::vector<double> f_out(gpu_opt_data->N);
    checkCudaErrors(cudaMemcpy(&f_out[0], d_f_out, sizeof(double)*opt->N, cudaMemcpyDeviceToHost));
    double f = 0.0;
    for(int i=0; i < opt->N; i++){
        //fout << f_out[i] << std::endl;
        f += f_out[i];

How different is “different”. If the two sets of results look nothing at all like each other there is probably a bug in the code. If the two sets of results look similar, but have small numeric differences, it is likely that the GPU results are more accurate than the CPU results, making them slightly different.

In that case, the most common reason for that is the use of FMA (fused multiply-add) on the GPU, which reduced rounding errors and gives some protection against subtractive cancellation. To test this working hypothesis, compile the GPU code with -fmad=false. If that makes the differences disappear, the hypothesis was correct.

The CPU version provides the resulting sum as f = 151.731 while GPU f = 114.314. So I am sure that there is a bug. Problem is I have checked the same code running in CPU and have got the correct answer. As I have said before, I am new to CUDA.

My suggestion would be to make the problem size as small as possible, while still reproducing different results. Then add printf() in strategic places to find out where the data starts diverging. You can also use this opportunity to acquaint yourself with the CUDA debugger and follow the program’s execution in this way (for relatively simple codes I personally find the printf approach adequate). Use cuda-memcheck to see if there are any out-of-bounds accesses or race conditions in the code.

Looking at the kernel, does something obvious jump at you as being wrong? I find that between 80-90 size, the problem starts diverging. It still makes no sense to me.

To be perfectly honest, I derive no joy from debugging my own code, let alone other people’s code. I am always happy to provide advice on general debugging techniques, however.