Problem:
I have run into a very simple but strange problem. When I multiply double values in CUDA, the result differs depending on whether the operands come from registers (test0) or from global memory (test1). But when I also multiply by x4 (x4 = 1.0), the results become the same (test2 and test3). And when I multiply by 1.0 directly (test4 and test5), the results are different again!
Code:
#include <stdio.h>
// Prints the expression a[1]*a[6] - a[2]*a[5] in six variants so the outputs
// can be compared side by side:
//   test0 / test2 / test4 use operands first copied into local variables,
//   test1 / test3 / test5 index `a` directly inside the expression,
//   test2 / test3 additionally multiply both products by a[15],
//   test5 multiplies both products by the literal 1.0.
// test4 repeats the test0 expression exactly as written (no explicit * 1.0).
//
// `a` must point to at least 16 doubles readable from device code (index 15
// is read). Every thread loads the values; only the thread with
// threadIdx.x == 0 in each block prints.
//
// NOTE(review): the last-digit differences between the variants are probably
// caused by the compiler contracting multiply+subtract into FMA differently
// for each expression -- confirm by rebuilding with `nvcc -fmad=false`.
__global__
void compute_on_gpu(double *a){
    double x0 = a[1];
    double x1 = a[6];
    double x2 = a[2];
    double x3 = a[5];
    double x4 = a[15];
    if(threadIdx.x == 0){
        // Operands taken from the local copies.
        printf("test0: %.16f\n", x0*x1 - x2*x3);
        // Same expression, operands indexed from `a`.
        printf("test1: %.16lf\n",a[1]*a[6]- a[2]*a[5]);
        // Both products scaled by a value only known at run time (a[15]).
        printf("test2: %.16f\n", x0*x1*x4 - x2*x3*x4);
        printf("test3: %.16lf\n",a[1]*a[6]*a[15] - a[2]*a[5]*a[15]);
        // Local-copy form again versus a compile-time 1.0 factor.
        printf("test4: %.16f\n", x0*x1 - x2*x3);
        printf("test5: %.16lf\n",a[1]*a[6]*1.0 - a[2]*a[5]*1.0);
    }
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cuda_ok(cudaError_t status, const char *what){
    if(status != cudaSuccess){
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
        return false;
    }
    return true;
}

// Host driver: uploads 16 doubles to the device, launches compute_on_gpu with
// one block of 32 threads, waits for it to finish, and releases the buffer.
// Returns 0 on success and 1 if any CUDA call or the kernel itself failed.
int main(){
    double a[16] = {
        20.9952629726873141,      // a[0]
        35187.4264233882131521,   // a[1]
        -95746.0777556623070268,  // a[2]
        0.0,                      // a[3]
        72.2689035191256579,      // a[4]
        120868.6083184806921054,  // a[5]
        -329572.6309625260764733, // a[6]
        0.0,                      // a[7]
        -834.3825760590481195,    // a[8]
        0.0000000000000000,       // a[9]
        -0.0242135821054153,      // a[10]
        0.0,                      // a[11]
        0.0,                      // a[12]
        0.0,                      // a[13]
        0.0,                      // a[14]
        1.0                       // a[15]
    };

    double *d_a = NULL;
    if(!cuda_ok(cudaMalloc((void **)&d_a, sizeof(a)), "cudaMalloc")){
        return 1;
    }

    bool ok = cuda_ok(cudaMemcpy(d_a, a, sizeof(a), cudaMemcpyHostToDevice), "cudaMemcpy");
    if(ok){
        compute_on_gpu<<<1,32>>>(d_a);
        // A launch returns no status itself: configuration errors show up in
        // cudaGetLastError(), faults during execution at the next sync.
        ok = cuda_ok(cudaGetLastError(), "kernel launch")
          && cuda_ok(cudaDeviceSynchronize(), "kernel execution");
    }

    // Always release the buffer, even when an earlier step failed.
    if(!cuda_ok(cudaFree(d_a), "cudaFree")){
        ok = false;
    }
    cudaDeviceReset();
    return ok ? 0 : 1;
}
Result:
test0: -24117532.8764190673828125
test1: -24117532.8764189481735229
test2: -24117532.8764190673828125
test3: -24117532.8764190673828125
test4: -24117532.8764190673828125
test5: -24117532.8764189481735229
Question:
Can someone tell me what’s wrong with my code?