Calling the following two kernels with

evalFitness0<<<32,1>>>(R_GPU, M_GPU, 32)
evalFitness2<<<32,32>>>(R_GPU, M_GPU, 32)

gives me different results, even though they really should be the same. (I've used small dimensions here because that's what I'm testing with, but I actually run these kernels with dimensions of 32-512.) Both kernels should be self-explanatory. In the second kernel I'm trying to exploit as much parallelism as possible by doing a parallel reduction within each block.
Any thoughts on why this is happening?
__global__ void evalFitness0(float* R, float* M, int Nd){
    float Mp;
    int p = blockIdx.x; // Current Block
    Mp = 0;
    for(int i = 0; i < Nd; i++){
        Mp += (R[p*Nd+i] * R[p*Nd+i]);
    }
    M[p] = -Mp;
}
__global__ void evalFitness2(float* R, float* M, int Nd){
    extern __shared__ float sdata[];
    int p = blockIdx.x;  // current block / individual
    int i = threadIdx.x; // element within the individual
    // Each thread squares one element into shared memory.
    sdata[i] = (R[p*Nd+i] * R[p*Nd+i]);
    __syncthreads();
    // Interleaved tree reduction: at step s, threads with i % (2*s) == 0
    // add in the partial sum that is s elements away.
    for(int s = 1; s < Nd; s *= 2){
        if(i % (2*s) == 0 && i+s < Nd){
            sdata[i] += sdata[i+s];
        }
        __syncthreads();
    }
    if(i == 0){
        M[p] = -sdata[0];
    }
}
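For reference, my host-side setup looks roughly like this (the allocation sizes, variable names, and copy steps here are sketched from memory, but the launch configurations are exactly the ones shown above):

int Np = 32, Nd = 32;                         // 32 individuals, 32 dimensions in this test
float *R_GPU, *M_GPU;
cudaMalloc((void**)&R_GPU, Np * Nd * sizeof(float));
cudaMalloc((void**)&M_GPU, Np * sizeof(float));
// ... copy the population data into R_GPU ...
evalFitness0<<<Np, 1>>>(R_GPU, M_GPU, Nd);    // one thread per block
// ... copy M_GPU back and record the results ...
evalFitness2<<<Np, Nd>>>(R_GPU, M_GPU, Nd);   // Nd threads per block
// ... copy M_GPU back again and compare ...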
Thanks.