Hi all, I am currently using cuda on a 3.0 compute capabilities accelerator (GTX680) for computation of 32 bit floats number, and I discovered some problems when comparing GPU and CPU computation results.
The weird thing append when I play with nvcc compiling options: while using
“-gencode arch=compute_30,code=sm_30” option I always obtain a rounded result.
But when I use nvcc -G option (device debug that I don’t use because it returns error due to graphical processing is processed in the same time) the floating point computation return the same value as the CPU.
A very simple example of kernel that shows an error:
__global__ void calc(float* fpDeviceBuffer, float a, float b, unsigned short c, float d)
{
*fpDeviceBuffer=a*(d+(b*c));
}
with Host code:
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
//cuda
#include <helper_cuda.h>
__global__ void calc(float* fpDeviceBuffer, float a, float b, unsigned short c, float d);
float a = 6708.0f;
float b = (4.0f - 0.25f)/( powf(2.0f, 16.0f) - 1.0f );
unsigned short c = 17278;
float d = 0.250000f;
float *fpHostBuffer = NULL;
float *fpDeviceBuffer = NULL;
int main(int argc, char* argv[])
{
float result=a*(d+(c*b));
//printf("CPU result %a\n",result);
printf("CPU result %f\n",result);
//page-locked memory
checkCudaErrors(cudaMallocHost(&fpHostBuffer,sizeof(float)));
//Device memory
checkCudaErrors(cudaMalloc(&fpDeviceBuffer,sizeof(float)));
calc<<<1,1>>>(fpDeviceBuffer,a,b,c,d);
checkCudaErrors( cudaMemcpy ( fpHostBuffer, fpDeviceBuffer, sizeof(float), cudaMemcpyDeviceToHost) );
printf("GPU result %a\n",*fpHostBuffer);
printf("GPU result %f\n",*fpHostBuffer);
checkCudaErrors(cudaFree(fpDeviceBuffer));
checkCudaErrors(cudaFreeHost(fpHostBuffer));
return(EXIT_SUCCESS);
}
result is
GPU result 0x1.03a7fep+13
GPU result 8308.999023
GPU result 0x1.03a8p+13
GPU result 8309.000000
When no debug
and
GPU result 0x1.03a7fep+13
GPU result 8308.999023
GPU result 0x1.03a7fep+13
GPU result 8308.999023
When -G option is used
I haven’t found many informations related to this issue in the Nvidia cuda compiler documentation, any idea ?