Why are the calculations different between CPU and GPU?
Are there anyone who know the cause and how to make the results same?
Test Device : Titan-X , Quadro RTX 6000
DIFF(11) / 5.7718768120 -5.4894809723 -0.4568047523 : CPU(63.6576385498) GPU(63.6576347351)
DIFF(17) / 7.0205116272 -3.9807739258 0.9889754057 : CPU(66.1122131348) GPU(66.1122207642)
DIFF(21) / 6.2994036674 -4.5103206635 1.7197880745 : CPU(62.9831542969) GPU(62.9831504822)
DIFF(25) / 5.3536248207 -5.5441226959 0.7952759862 : CPU(60.0310630798) GPU(60.0310592651)
DIFF(26) / 5.4337601662 -5.4639873505 2.3032853603 : CPU(64.6860275269) GPU(64.6860351562)
DIFF(28) / 7.2088255882 -3.6889219284 2.8629994392 : CPU(73.7720794678) GPU(73.7720718384)
DIFF(31) / 7.6020116806 -2.9594936371 3.0124745369 : CPU(75.6241836548) GPU(75.6241912842)
DIFF(35) / 9.5662813187 -0.9373407364 2.2401032448 : CPU(97.4104156494) GPU(97.4104080200)
DIFF(37) / 10.5662345886 0.3695564270 2.8683333397 : CPU(120.0092239380) GPU(120.0092163086)
#include <stdio.h>
float data[][5] = {
{4.6963300705, -6.4615783691, 0.0183835626, 63.8078498840},
{4.7764654160, -6.3814430237, 1.5263929367, 65.8673095703},
{5.3628854752, -5.7950229645, 2.2014012337, 67.1889953613},
{6.5515308380, -4.6063776016, 2.0861070156, 68.4931182861},
{4.6488780975, -6.5090303421, 2.8611006737, 72.1654434204},
{4.8525705338, -6.4884567261, -1.3599994183, 67.4971160889},
{4.9327058792, -6.4083213806, 0.1480098963, 65.4200744629},
{5.5191259384, -5.8219013214, 0.8230181932, 65.0326461792},
{6.7077713013, -4.6332559586, 0.7077239752, 66.9621276855},
{4.8051185608, -6.5359086990, 1.4827176332, 68.0057144165},
{5.6917414665, -5.5696163177, -1.9648141861, 67.2770385742},
{5.7718768120, -5.4894809723, -0.4568047523, 63.6576347351},
{6.3582968712, -4.9030609131, 0.2182035446, 64.5155639648},
{7.5469422340, -3.7144155502, 0.1029093266, 70.7638092041},
{5.6442894936, -5.6170682907, 0.8779029846, 64.1801757812},
{6.3539562225, -4.6473293304, -1.1940422058, 63.3961639404},
{6.4340915680, -4.5671939850, 0.3139671087, 62.3553695679},
{7.0205116272, -3.9807739258, 0.9889754057, 66.1122207642},
{8.2091569901, -2.7921285629, 0.8736811876, 75.9495620728},
{6.3065042496, -4.6947813034, 1.6486748457, 64.5310974121},
{6.2192683220, -4.5904560089, 0.2117787600, 59.7964363098},
{6.2994036674, -4.5103206635, 1.7197880745, 62.9831504822},
{6.8858237267, -3.9239006042, 2.3947963715, 68.5466156006},
{8.0744686127, -2.7352552414, 2.2795021534, 77.8747940063},
{6.1718163490, -4.6379079819, 3.0544958115, 68.9314575195},
{5.3536248207, -5.5441226959, 0.7952759862, 60.0310592651},
{5.4337601662, -5.4639873505, 2.3032853603, 64.6860351562},
{6.0201802254, -4.8775672913, 2.9782936573, 68.9034652710},
{7.2088255882, -3.6889219284, 2.8629994392, 73.7720718384},
{5.3061728477, -5.5915746689, 3.6379930973, 72.6561737061},
{7.0155916214, -3.5459136963, 2.3374662399, 67.2557754517},
{7.6020116806, -2.9594936371, 3.0124745369, 75.6241912842},
{8.7906570435, -1.7708482742, 2.8971803188, 88.8052062988},
{6.8880043030, -3.6735010147, 3.6721739769, 74.4240722656},
{8.3776359558, -2.1259860992, 2.3553977013, 80.2524948120},
{9.5662813187, -0.9373407364, 2.2401032448, 97.4104080200},
{7.6636285782, -2.8399934769, 3.0150971413, 75.8875808716},
{10.5662345886, 0.3695564270, 2.8683333397, 120.0092163086},
{8.6635818481, -1.5330963135, 3.6433269978, 90.6818695068},
{7.5730867386, -3.2432613373, -0.0125946999, 67.8705444336},
{8.1595067978, -2.6568412781, 0.6624135971, 74.0751495361},
{9.3481521606, -1.4681959152, 0.5471193790, 89.8428878784},
{7.4454994202, -3.3708486557, 1.3221130371, 68.5460662842}
};
__global__ void multiple_dev (float (*d)[5])
{
for (int i = 0; i < 42; i++) {
d[i][4] = d[i][0]*d[i][0] + d[i][1]*d[i][1] + d[i][2]*d[i][2];
}
}
void multiple_host (float (*d)[5])
{
for (int i = 0; i < 42; i++) {
d[i][3] = d[i][0]*d[i][0] + d[i][1]*d[i][1] + d[i][2]*d[i][2];
}
}
int main()
{
// distance square in CPU
multiple_host(data);
// distance square in GPU
float (*dev_data) [5];
cudaMalloc( (void**)&dev_data, sizeof(data) );
cudaMemcpy( dev_data, data, sizeof(data), cudaMemcpyHostToDevice);
multiple_dev <<<1, 1>>> (dev_data);
cudaMemcpy( data, dev_data, sizeof(data), cudaMemcpyDeviceToHost);
// check the results from CPU and GPU
for (int i = 0; i < 42; i++) {
if (data[i][3] != data[i][4]) {
printf ("DIFF(%02d) / %.10f %.10f %.10f : CPU(%.10f) GPU(%.10f) \n"
, i, data[i][0], data[i][1], data[i][2], data[i][3], data[i][4]);
}
}
return 0;
}