Hello,
I have a question about the precision of floating-point operations. The following code shows an example:
[codebox]#include <cuda.h>
#include <stdio.h>
// NOTE: the forum formatting stripped the leading/trailing underscores
// from the kernel qualifier; it must read __global__ to compile.
__global__
void compute(float * data) {
    // One thread performs a single addition: data[0] += data[1].
    data[0] = data[0] + data[1];
}

int main() {
    // Operands are given as raw IEEE-754 bit patterns so that both runs
    // add exactly the same two float values, only in swapped order.
    unsigned int udata1[] = {817504365, 1065158090};
    unsigned int udata2[] = {1065158090, 817504365};
    float * fdata = (float *)udata1;
    float * dfdata;
    float debug1, debug2;
    cudaMalloc((void **)&dfdata, 2 * sizeof(*dfdata));
    // Run 1: a + b
    cudaMemcpy((void*)dfdata, (void const *)udata1, 2 * sizeof(*fdata), cudaMemcpyHostToDevice);
    compute<<<1,1>>>(dfdata);
    // The blocking cudaMemcpy on the default stream waits for the kernel
    // to finish before copying the result back.
    cudaMemcpy((void*)&debug1, (void const *)dfdata, 1 * sizeof(*fdata), cudaMemcpyDeviceToHost);
    // Run 2: b + a (same values, swapped order)
    cudaMemcpy((void*)dfdata, (void const *)udata2, 2 * sizeof(*fdata), cudaMemcpyHostToDevice);
    compute<<<1,1>>>(dfdata);
    cudaMemcpy((void*)&debug2, (void const *)dfdata, 1 * sizeof(*fdata), cudaMemcpyDeviceToHost);
    // CPU reference for comparison against both GPU results.
    float temp = fdata[0] + fdata[1];
    printf("CPU: %.20f GPU1: %.20f GPU2: %.20f\n", temp, debug1, debug2);
    // Print the raw bit patterns as well, to make any 1-ulp difference visible.
    printf("CPU: %u GPU1: %u GPU2: %u\n", *(unsigned int *)&temp, *(unsigned int *)&debug1, *(unsigned int *)&debug2);
    cudaFree(dfdata);
    return 0;
}[/codebox]
Output:
[codebox]CPU: 0.98836958408355712891 GPU1: 0.98836958408355712891 GPU2: 0.98836958408355712891
CPU: 1065158090 GPU1: 1065158090 GPU2: 1065158090
[/codebox]
Everything is fine; commutativity holds (a+b == b+a).
[codebox]#include <cuda.h>
#include <stdio.h>
// NOTE: the forum formatting stripped the leading/trailing underscores
// from the kernel qualifier; it must read __global__ to compile.
__global__
void compute(float * data) {
    // The compiler may contract one of the mul+add pairs into a single
    // mad/FMA instruction. A fused multiply-add rounds only once, while a
    // separate mul-then-add rounds twice, so a*b + c*d can differ by one
    // ulp from c*d + a*b depending on which product ends up inside the mad.
    data[0] = data[0] * data[1] + data[2] * data[3];
}

int main() {
    // Four floats as raw IEEE-754 bit patterns; udata2 swaps the two
    // products so both runs compute the same mathematical expression.
    unsigned int udata1[] = {893268102, 989497344, 1065253733, 1065257002};
    unsigned int udata2[] = {1065253733, 1065257002, 893268102, 989497344};
    float * fdata = (float *)udata1;
    float * dfdata;
    float debug1, debug2;
    // Fixed: the original allocated and copied 6 elements, but the host
    // arrays hold only 4 — the cudaMemcpy read 8 bytes past the end of
    // udata1/udata2 (undefined behavior). The kernel only reads data[0..3].
    cudaMalloc((void **)&dfdata, 4 * sizeof(*dfdata));
    // Run 1: a*b + c*d
    cudaMemcpy((void*)dfdata, (void const *)udata1, 4 * sizeof(*fdata), cudaMemcpyHostToDevice);
    compute<<<1,1>>>(dfdata);
    cudaMemcpy((void*)&debug1, (void const *)dfdata, 1 * sizeof(*fdata), cudaMemcpyDeviceToHost);
    // Run 2: c*d + a*b (same products, swapped order)
    cudaMemcpy((void*)dfdata, (void const *)udata2, 4 * sizeof(*fdata), cudaMemcpyHostToDevice);
    compute<<<1,1>>>(dfdata);
    cudaMemcpy((void*)&debug2, (void const *)dfdata, 1 * sizeof(*fdata), cudaMemcpyDeviceToHost);
    // CPU reference (the host compiler may or may not contract to FMA,
    // so it need not match either GPU result bit-for-bit).
    float temp = fdata[0] * fdata[1] + fdata[2] * fdata[3];
    printf("CPU: %.20f GPU1: %.20f GPU2: %.20f\n", temp, debug1, debug2);
    // Print the raw bit patterns as well, to make any 1-ulp difference visible.
    printf("CPU: %u GPU1: %u GPU2: %u\n", *(unsigned int *)&temp, *(unsigned int *)&debug1, *(unsigned int *)&debug2);
    cudaFree(dfdata);
    return 0;
}[/codebox]
Output:
[codebox]CPU: 0.98836958408355712891 GPU1: 0.98836958408355712891 GPU2: 0.98836952447891235352
CPU: 1065158090 GPU1: 1065158090 GPU2: 1065158089
[/codebox]
ab + cd != cd + ab?
The kernel performs the first multiplication with the mul instruction and the second with the mad instruction. The two multiplications are done separately, but the add is fused into the mad.
Can anyone explain why the results are different?
Moritz