Hi all,
I have a simple cuda program which executed on the CPU and on the GPU, gives different results when the GPU is a Tesla card and gives the same results when the GPU is a Fermi card.
Does anybody know any explanation for the difference of the results?
I post also the code of my program:
/* File: testDefinitions.h */
#define MVMul(v1, m, v2)
(v1).x = (m)[0] * (v2).x + (m)[3] * (v2).y + (m)[6] * (v2).z,
(v1).y = (m)[1] * (v2).x + (m)[4] * (v2).y + (m)[7] * (v2).z,
(v1).z = (m)[2] * (v2).x + (m)[5] * (v2).y + (m)[8] * (v2).z
typedef struct {
float x, y, z;
} triple;
typedef struct {
float u[9];
} ninuple;
/* File testc.c */
#include
#include
#include
#include “testDefinitions.h”
float uNewGpuHost[3];
void preparation_gpu_matrix_product(triple, ninuple);
// main routine that executes on the host
int main(int argc, char **argv) {
triple uOld, uNewGpu, uNewHost;
ninuple Ru;
uOld.x = 2.39282537;
uOld.y = 0.653135121;
uOld.z = -0.224300578;
Ru.u[0] = 1;
Ru.u[1] = 0;
Ru.u[2] = 0;
Ru.u[3] = 0;
Ru.u[4] = 0.999999523;
Ru.u[5] = -0.000974848808;
Ru.u[6] = 0;
Ru.u[7] = 0.000974848808;
Ru.u[8] = 0.999999523;
MVMul(uNewHost, Ru.u, uOld);
printf(“\nValue of uNew calculated by host: %.20f %.20f %.20f\n”, uNewHost.x, uNewHost.y, uNewHost.z);
preparation_gpu_matrix_product(uOld, Ru);
uNewGpu.x = uNewGpuHost[0];
uNewGpu.y = uNewGpuHost[1];
uNewGpu.z = uNewGpuHost[2];
printf(“Value of uNew calculated by GPU: %.20f %.20f %.20f\n”, uNewGpu.x, uNewGpu.y, uNewGpu.z);
if(((uNewHost.x-uNewGpu.x)==0) && ((uNewHost.y-uNewGpu.y)==0) && ((uNewHost.z-uNewGpu.z)==0)) {
printf(“Test PASSED\n”);
}
else
printf(“Test FAILED\n”);
return 0;
}
/* File test.cu */
#include
#include
global void matrix_product(triple, ninuple, float*);
extern float uNewGpuHost[3];
extern “C” {
void preparation_gpu_matrix_product(triple uOld, ninuple Ru) {
float* uNewDev;
cudaMalloc((void **) &uNewDev, 3*sizeof(float)); // Allocate array on device
cudaError_t error=cudaGetLastError();
if(error != cudaSuccess)printf(“cudaMalloc: %s\n”,cudaGetErrorString(error));
// Do calculation on device:
matrix_product <<< 1, 16 >>> (uOld, Ru, uNewDev);
cudaThreadSynchronize();
error=cudaGetLastError();
if(error != cudaSuccess)printf(“Synchronize kernel: %s\n”,cudaGetErrorString(error));
// Retrieve result from device and store it in host array
cudaMemcpy(uNewGpuHost, uNewDev, sizeof(float)*3, cudaMemcpyDeviceToHost);
error=cudaGetLastError();
if(error != cudaSuccess)printf(“cuda copy device to host: %s\n”,cudaGetErrorString(error));
cudaFree(uNewDev);
}
}
global void matrix_product(triple uOld, ninuple Ru, float* uNewDev) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if(idx==0) {
triple uNew;
MVMul(uNew, Ru.u, uOld);
uNewDev[0]=uNew.x;
uNewDev[1]=uNew.y;
uNewDev[2]=uNew.z;
}
}
The results of this simple program targeted to a Tesla card: x295 are:
Value of uNew calculated by host: 2.39282536506652832031 0.65291619300842285156 -0.22493718564510345459
Value of uNew calculated by GPU: 2.39282536506652832031 0.65291619300842285156 -0.22493717074394226074
Test FAILED
insted when run on a Fermi card the results are:
Value of uNew calculated by host: 2.39282536506652832031 0.65291619300842285156 -0.22493718564510345459
Value of uNew calculated by GPU: 2.39282536506652832031 0.65291619300842285156 -0.22493718564510345459
Test PASSED
Does anybody know why?
Thank you in advance,
Ardita
testDefinitions.h (393 Bytes)
test.cu (1.22 KB)