Different results between GPU and CPU when the program runs on a Tesla card, same results when it runs on a Fermi card

Hi all,

I have a simple CUDA program which, executed on the CPU and on the GPU, gives different results when the GPU is a Tesla card and the same results when the GPU is a Fermi card.

Does anybody have an explanation for the difference in the results?

I also post the code of my program:

/* File: testDefinitions.h */
#define MVMul(v1, m, v2) \
    (v1).x = (m)[0] * (v2).x + (m)[3] * (v2).y + (m)[6] * (v2).z, \
    (v1).y = (m)[1] * (v2).x + (m)[4] * (v2).y + (m)[7] * (v2).z, \
    (v1).z = (m)[2] * (v2).x + (m)[5] * (v2).y + (m)[8] * (v2).z

typedef struct {
    float x, y, z;
} triple;

typedef struct {
    float u[9];
} ninuple;

/* File testc.c */
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include "testDefinitions.h"

float uNewGpuHost[3];
void preparation_gpu_matrix_product(triple, ninuple);

// main routine that executes on the host
int main(int argc, char **argv) {
    triple uOld, uNewGpu, uNewHost;
    ninuple Ru;
    uOld.x = 2.39282537;
    uOld.y = 0.653135121;
    uOld.z = -0.224300578;
    Ru.u[0] = 1;
    Ru.u[1] = 0;
    Ru.u[2] = 0;
    Ru.u[3] = 0;
    Ru.u[4] = 0.999999523;
    Ru.u[5] = -0.000974848808;
    Ru.u[6] = 0;
    Ru.u[7] = 0.000974848808;
    Ru.u[8] = 0.999999523;

    MVMul(uNewHost, Ru.u, uOld);
    printf("\nValue of uNew calculated by host: %.20f %.20f %.20f\n", uNewHost.x, uNewHost.y, uNewHost.z);
    preparation_gpu_matrix_product(uOld, Ru);

    uNewGpu.x = uNewGpuHost[0];
    uNewGpu.y = uNewGpuHost[1];
    uNewGpu.z = uNewGpuHost[2];

    printf("Value of uNew calculated by GPU: %.20f %.20f %.20f\n", uNewGpu.x, uNewGpu.y, uNewGpu.z);

    if(((uNewHost.x-uNewGpu.x)==0) && ((uNewHost.y-uNewGpu.y)==0) && ((uNewHost.z-uNewGpu.z)==0)) {
        printf("Test PASSED\n");
    }
    else
        printf("Test FAILED\n");
    return 0;
}

/* File test.cu */
#include <stdio.h>
#include "testDefinitions.h"

__global__ void matrix_product(triple, ninuple, float*);

extern float uNewGpuHost[3];

extern "C" {
void preparation_gpu_matrix_product(triple uOld, ninuple Ru) {
    float* uNewDev;

    cudaMalloc((void **) &uNewDev, 3*sizeof(float)); // Allocate array on device
    cudaError_t error = cudaGetLastError();
    if(error != cudaSuccess) printf("cudaMalloc: %s\n", cudaGetErrorString(error));

    // Do calculation on device:
    matrix_product <<< 1, 16 >>> (uOld, Ru, uNewDev);
    cudaThreadSynchronize();
    error = cudaGetLastError();
    if(error != cudaSuccess) printf("Synchronize kernel: %s\n", cudaGetErrorString(error));

    // Retrieve result from device and store it in host array
    cudaMemcpy(uNewGpuHost, uNewDev, sizeof(float)*3, cudaMemcpyDeviceToHost);
    error = cudaGetLastError();
    if(error != cudaSuccess) printf("cuda copy device to host: %s\n", cudaGetErrorString(error));

    cudaFree(uNewDev);
}
} // end extern "C"

__global__ void matrix_product(triple uOld, ninuple Ru, float* uNewDev) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if(idx == 0) {
        triple uNew;
        MVMul(uNew, Ru.u, uOld);
        uNewDev[0] = uNew.x;
        uNewDev[1] = uNew.y;
        uNewDev[2] = uNew.z;
    }
}

The results of this simple program targeted to a Tesla card (a GTX 295) are:
Value of uNew calculated by host: 2.39282536506652832031 0.65291619300842285156 -0.22493718564510345459
Value of uNew calculated by GPU: 2.39282536506652832031 0.65291619300842285156 -0.22493717074394226074
Test FAILED

Instead, when run on a Fermi card, the results are:
Value of uNew calculated by host: 2.39282536506652832031 0.65291619300842285156 -0.22493718564510345459
Value of uNew calculated by GPU: 2.39282536506652832031 0.65291619300842285156 -0.22493718564510345459
Test PASSED

Does anybody know why?

Thank you in advance,
Ardita

What Tesla card are you using?

BTW, never compare floats like this:

if(((uNewHost.x-uNewGpu.x)==0) && ((uNewHost.y-uNewGpu.y)==0) && ((uNewHost.z-uNewGpu.z)==0))
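
One way to check which architecture you are actually running on is to query the device's compute capability at runtime: 1.x means a pre-Fermi part such as the GTX 295, 2.x means Fermi. A minimal sketch using the standard CUDA runtime API (error handling omitted):

#include <stdio.h>
#include <cuda_runtime.h>

int main(void) {
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0); // properties of device 0
    printf("Device 0: %s, compute capability %d.%d\n",
           prop.name, prop.major, prop.minor);
    return 0;
}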

I am using a GTX 295 Tesla card. How should I compare the floats then?

Thank you for your answer,

Ardita

With some epsilon: float computations have limited precision.
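
For example, a comparison with an absolute tolerance could look like the sketch below. The tolerance 1e-6f is just an illustration and should be tuned to the magnitudes in your problem; the two values are the z components printed in the first post:

#include <math.h>
#include <stdio.h>

/* Returns 1 if a and b agree to within absolute tolerance eps. */
static int nearly_equal(float a, float b, float eps) {
    return fabsf(a - b) <= eps;
}

int main(void) {
    float host = -0.22493718564510345459f; /* host z from the first post */
    float gpu  = -0.22493717074394226074f; /* GPU z on the GTX 295 */
    printf("Test %s\n", nearly_equal(host, gpu, 1e-6f) ? "PASSED" : "FAILED");
    return 0;
}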
