Hi,
Just wondering why this simple kernel, which should generate a vector of 49995000s (each element receives every value of i from 0 to n-1 exactly once, and 0 + 1 + … + 9999 = 49995000), returns different values every time:
#include <stdio.h>

__global__ void k1(float* datap, int n) {
    int idx = blockIdx.x*blockDim.x + threadIdx.x;
    if (idx >= n) return;
    __syncthreads();
    for (int i = 0; i < n; i++) {
        __syncthreads();
        //atomicAdd(&datap[(idx+i)%n], i);
        //datap[(idx+i)%n] += 1;
        atomicAdd(&datap[(idx+i)%n], i);
    }
}

int main() {
    float* datap;
    int n = 10000;
    float* datap_h = new float[n];
    cudaMalloc(&datap, n*sizeof(float));
    cudaMemset(datap, 0, n*sizeof(float));
    k1<<<(n+255)/256, 256>>>(datap, n);
    cudaMemcpy(datap_h, datap, n*sizeof(float), cudaMemcpyDefault);
    printf("%f %f %f\n", datap_h[0], datap_h[1], datap_h[n-1]);
    return 0;
}
The results seem very random to me:
jie@jiemayo0:~/Documents/code/cuda/mpitest$ ./a.out
49993416.000000 49993424.000000 49993432.000000
jie@jiemayo0:~/Documents/code/cuda/mpitest$ ./a.out
49993280.000000 49993256.000000 49993296.000000
jie@jiemayo0:~/Documents/code/cuda/mpitest$ ./a.out
49993192.000000 49993184.000000 49993192.000000
jie@jiemayo0:~/Documents/code/cuda/mpitest$ ./a.out
49993304.000000 49993296.000000 49993296.000000
I have tried this on both a K80 and a 1080Ti.
Best,
njuffa
December 27, 2018, 10:07pm
2
I find no joy in debugging other people’s code, so consider the following as hints rather than conclusive evidence of a particular root cause for your observations.
Have you checked for potential bugs by running under control of cuda-memcheck? If that comes up clean, you are likely experiencing an instance of the following:
Floating-point addition, unlike mathematical addition, is not associative, and the use of atomic adds results in an unspecified order of additions, so the rounded sum can differ from run to run.
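As a quick host-side illustration (plain C++, separate from your kernel): above 2^24 the spacing between adjacent single-precision values is at least 2, so individual additions can be rounded away, and the result of a long sum depends on the order in which the terms are added.

#include <cstdio>

int main() {
    // Above 2^24 = 16777216, consecutive single-precision floats are at least
    // 2 apart, so adding a small value can be rounded away entirely.
    float big = 16777216.0f;                      // 2^24
    printf("%.1f + 1 = %.1f\n", big, big + 1.0f); // prints 16777216.0 + 1 = 16777216.0

    // Summing 0..9999 (exact value 49995000) in two different orders: once the
    // running sum passes 2^24 each addition may be rounded, and the rounding
    // depends on the current partial sum, i.e. on the order of the additions,
    // so the two results need not agree with each other or with 49995000.
    float fwd = 0.0f, rev = 0.0f;
    for (int i = 0; i < 10000; ++i) fwd += (float)i;
    for (int i = 9999; i >= 0; --i) rev += (float)i;
    printf("forward: %.1f  reverse: %.1f  exact: 49995000\n", fwd, rev);
    return 0;
}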
tera
December 27, 2018, 10:13pm
3
jieshan1987:
Just wondering why this simple kernel, which should generate a vector of 49995000s, returns different values every time: […]
Edit: once again njuffa was quicker…
Hi njuffa and tera,
Thank you so much for the insightful and helpful response.
I have checked with cuda-memcheck and also tried removing the __syncthreads() calls to rule out those causes.
The real cause is that I neglected the precision limit of single-precision floating-point numbers. My bad.
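In case it is useful to anyone who finds this thread later, one way to avoid the rounding is to accumulate in integers (the sum 49995000 fits comfortably in a 32-bit int, and the integer overload of atomicAdd is exact). This is only a sketch, not code from this thread; the name k1_int and the int* buffer are my own assumptions:

__global__ void k1_int(int* datap, int n) {
    int idx = blockIdx.x*blockDim.x + threadIdx.x;
    if (idx >= n) return;
    for (int i = 0; i < n; i++) {
        // Integer atomicAdd adds exactly, so every element ends up as
        // 0 + 1 + ... + (n-1) = n*(n-1)/2, regardless of the order of additions.
        atomicAdd(&datap[(idx+i)%n], i);
    }
}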
Thank you so much!
Best