Hi fellows,
I’m trying to find a cause of a deviation in a result of simple math operation on GTX 260 comparing to that on Opteron 285 (reference). The deviation is introduced only if shared memory is involved. Below are 2 cases. In the 1st case GTX 260 produces the same result as the Opteron, in the second case it gives 2 bit difference from the reference. As you can see from ptx code math operations seemed to be the same in both cases. The only difference is that parameters in the 1st case loaded into registers directly as literals whereas in the 2nd case they go through shared memory. Few things come in mind: me being a bad boy, shared memory load being imprecise (by design or bug) or something else. If you know a way to correct it without eliminating shared memory from the code, please let me know. Thank you.
1st case. GTX 260 result is the same as reference (binary BD 7C AF C1):
[codebox] if(t_id == 0 && b_id == 0 && step ==0)
{
float reg0, reg1, reg2, reg3;
*((int *)(®0)) = 0x3BF5C28F;
*((int *)(®1)) = 0xC1180000;
*((int *)(®2)) = 0x3F62CB81;
*((int *)(®3)) = 0xBE4352C3;
*((float *)&d_err_code[11]) = reg0 * (reg1 * reg2 - reg3);
}
.loc 15 1509 0
mov.s32 %r55, 1005961871;
mov.b32 %f3, %r55;
.loc 15 1510 0
mov.s32 %r56, -1055391744;
mov.b32 %f4, %r56;
.loc 15 1511 0
mov.s32 %r57, 1063439233;
mov.b32 %f5, %r57;
.loc 15 1512 0
mov.s32 %r58, -1102884157;
mov.b32 %f6, %r58;
.loc 15 1513 0
mov.f32 %f7, %f3;
mov.f32 %f8, %f6;
mov.f32 %f9, %f4;
mov.f32 %f10, %f5;
mul.f32 %f11, %f9, %f10;
sub.f32 %f12, %f11, %f8;
mul.f32 %f13, %f7, %f12;[/codebox]
2nd case. GTX 260 result is different by 2 bits from the reference (binary BD 7C AF BF):
[codebox] if(t_id == 0 && b_id == 0 && step ==0)
{
__shared__ float reg[5];
*((int *)(®[0])) = 0x3BF5C28F;
*((int *)(®[1])) = 0xC1180000;
*((int *)(®[2])) = 0x3F62CB81;
*((int *)(®[3])) = 0xBE4352C3;
*((float *)&d_err_code[11]) = reg[0] * (reg[1] * reg[2] - reg[3]);
}
.loc 15 1509 0
mov.s32 %r55, 1005961871;
st.shared.s32 [reg+0], %r55;
.loc 15 1510 0
mov.s32 %r56, -1055391744;
st.shared.s32 [reg+4], %r56;
.loc 15 1511 0
mov.s32 %r57, 1063439233;
st.shared.s32 [reg+8], %r57;
.loc 15 1512 0
mov.s32 %r58, -1102884157;
st.shared.s32 [reg+12], %r58;
.loc 15 1513 0
ld.shared.f32 %f3, [reg+0];
ld.shared.f32 %f4, [reg+12];
ld.shared.f32 %f5, [reg+4];
ld.shared.f32 %f6, [reg+8];
mul.f32 %f7, %f5, %f6;
sub.f32 %f8, %f7, %f4;
mul.f32 %f9, %f3, %f8;[/codebox]