shared memory load affects precision of floating point computation

Hi fellows,

I’m trying to find the cause of a deviation in the result of a simple math operation on a GTX 260 compared to an Opteron 285 (the reference). The deviation appears only when shared memory is involved. Below are two cases. In the 1st case the GTX 260 produces the same result as the Opteron; in the 2nd case it differs from the reference by 2 bits. As you can see from the PTX code, the math operations appear to be the same in both cases. The only difference is that in the 1st case the parameters are loaded into registers directly as literals, whereas in the 2nd case they go through shared memory. A few things come to mind: me being a bad boy, the shared memory load being imprecise (by design or by a bug), or something else. If you know a way to correct this without eliminating shared memory from the code, please let me know. Thank you.
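For completeness, here is a minimal host-side sketch (plain C; the bits_to_float helper name is hypothetical) of the bit-for-bit comparison used in this thread: it rebuilds the four operands from their bit patterns, evaluates the expression, and prints the result word in hex. memcpy is used for the reinterpretation, which is well-defined, unlike the pointer casts in the kernels below:

[codebox]#include <stdio.h>
#include <string.h>

/* Reinterpret a 32-bit pattern as a float (well-defined via memcpy). */
static float bits_to_float(unsigned int u)
{
    float f;
    memcpy(&f, &u, sizeof f);
    return f;
}

int main(void)
{
    float reg0 = bits_to_float(0x3BF5C28F);
    float reg1 = bits_to_float(0xC1180000);
    float reg2 = bits_to_float(0x3F62CB81);
    float reg3 = bits_to_float(0xBE4352C3);

    float r = reg0 * (reg1 * reg2 - reg3);

    /* Print the result as a 32-bit hex word for bit-exact comparison. */
    unsigned int u;
    memcpy(&u, &r, sizeof u);
    printf("%.9g = %08X\n", r, u);  /* the reference value quoted below is BD7CAFC1 */
    return 0;
}[/codebox]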

1st case: the GTX 260 result is the same as the reference (binary BD 7C AF C1):

[codebox]if(t_id == 0 && b_id == 0 && step == 0)
{
    float reg0, reg1, reg2, reg3;

    *((int *)(&reg0)) = 0x3BF5C28F;
    *((int *)(&reg1)) = 0xC1180000;
    *((int *)(&reg2)) = 0x3F62CB81;
    *((int *)(&reg3)) = 0xBE4352C3;

    *((float *)&d_err_code[11]) = reg0 * (reg1 * reg2 - reg3);
}

.loc	15	1509	0
mov.s32 	%r55, 1005961871;
mov.b32 	%f3, %r55;
.loc	15	1510	0
mov.s32 	%r56, -1055391744;
mov.b32 	%f4, %r56;
.loc	15	1511	0
mov.s32 	%r57, 1063439233;
mov.b32 	%f5, %r57;
.loc	15	1512	0
mov.s32 	%r58, -1102884157;
mov.b32 	%f6, %r58;
.loc	15	1513	0
mov.f32 	%f7, %f3;
mov.f32 	%f8, %f6;
mov.f32 	%f9, %f4;
mov.f32 	%f10, %f5;
mul.f32 	%f11, %f9, %f10;
sub.f32 	%f12, %f11, %f8;
mul.f32 	%f13, %f7, %f12;[/codebox]

2nd case: the GTX 260 result differs from the reference by 2 bits (binary BD 7C AF BF):

[codebox]if(t_id == 0 && b_id == 0 && step == 0)
{
    __shared__ float reg[5];

    *((int *)(&reg[0])) = 0x3BF5C28F;
    *((int *)(&reg[1])) = 0xC1180000;
    *((int *)(&reg[2])) = 0x3F62CB81;
    *((int *)(&reg[3])) = 0xBE4352C3;

    *((float *)&d_err_code[11]) = reg[0] * (reg[1] * reg[2] - reg[3]);
}

.loc	15	1509	0
mov.s32 	%r55, 1005961871;
st.shared.s32 	[reg+0], %r55;
.loc	15	1510	0
mov.s32 	%r56, -1055391744;
st.shared.s32 	[reg+4], %r56;
.loc	15	1511	0
mov.s32 	%r57, 1063439233;
st.shared.s32 	[reg+8], %r57;
.loc	15	1512	0
mov.s32 	%r58, -1102884157;
st.shared.s32 	[reg+12], %r58;
.loc	15	1513	0
ld.shared.f32 	%f3, [reg+0];
ld.shared.f32 	%f4, [reg+12];
ld.shared.f32 	%f5, [reg+4];
ld.shared.f32 	%f6, [reg+8];
mul.f32 	%f7, %f5, %f6;
sub.f32 	%f8, %f7, %f4;
mul.f32 	%f9, %f3, %f8;[/codebox]

Consider bracketing the relational operations in the if condition for more readability…

Since only one thread is doing the math, the behaviour is very hard to explain…

Thank you, good point. I tried with all threads, but the deviation is still present:

1st case: the GTX 260 result is the same as the reference (binary BD 7C AF C1):

[codebox]float reg0, reg1, reg2, reg3;

*((int *)(&reg0)) = 0x3BF5C28F;
*((int *)(&reg1)) = 0xC1180000;
*((int *)(&reg2)) = 0x3F62CB81;
*((int *)(&reg3)) = 0xBE4352C3;

d_trace_v[b_id*GRID_SIZE_UPDT + t_id] = reg0 * (reg1 * reg2 - reg3);

.loc	15	1947	0
mov.s32 	%r468, 1005961871;
mov.b32 	%f351, %r468;
.loc	15	1948	0
mov.s32 	%r469, -1055391744;
mov.b32 	%f352, %r469;
.loc	15	1949	0
mov.s32 	%r470, 1063439233;
mov.b32 	%f353, %r470;
.loc	15	1950	0
mov.s32 	%r471, -1102884157;
mov.b32 	%f354, %r471;
.loc	15	1952	0
mov.f32 	%f355, %f351;
mov.f32 	%f356, %f354;
mov.f32 	%f357, %f352;
mov.f32 	%f358, %f353;
mul.f32 	%f359, %f357, %f358;
sub.f32 	%f360, %f359, %f356;
mul.f32 	%f361, %f355, %f360;
ld.param.u64 	%rd294, [__cudaparm__Z23update_kernelPiPfmS0_PjS_S0_S0_S0__d_trace_v];
mov.s32 	%r472, %r2;
mov.s32 	%r473, %r4;
mul24.lo.u32 	%r474, %r473, 48;
add.u32 	%r475, %r472, %r474;
cvt.u64.u32 	%rd295, %r475;
mul.lo.u64 	%rd296, %rd295, 4;
add.u64 	%rd297, %rd294, %rd296;
st.global.f32 	[%rd297+0], %f361;[/codebox]

2nd case: the GTX 260 result differs from the reference by 2 bits (binary BD 7C AF BF):

[codebox]__shared__ float reg[5];

if( t_id == 0 )
{
    *((int *)(&reg[0])) = 0x3BF5C28F;
    *((int *)(&reg[1])) = 0xC1180000;
    *((int *)(&reg[2])) = 0x3F62CB81;
    *((int *)(&reg[3])) = 0xBE4352C3;
}

__syncthreads();

d_trace_v[b_id*GRID_SIZE_UPDT + t_id] = reg[0] * (reg[1] * reg[2] - reg[3]);

.loc	15	1949	0
mov.s32 	%r470, 1005961871;
st.shared.s32 	[reg+0], %r470;
.loc	15	1950	0
mov.s32 	%r471, -1055391744;
st.shared.s32 	[reg+4], %r471;
.loc	15	1951	0
mov.s32 	%r472, 1063439233;
st.shared.s32 	[reg+8], %r472;
.loc	15	1952	0
mov.s32 	%r473, -1102884157;
st.shared.s32 	[reg+12], %r473;
$Lt_1_111106:
.loc	15	1955	0
bar.sync 	0;
.loc	15	1957	0
ld.shared.f32 	%f351, [reg+0];
ld.shared.f32 	%f352, [reg+12];
ld.shared.f32 	%f353, [reg+4];
ld.shared.f32 	%f354, [reg+8];
mul.f32 	%f355, %f353, %f354;
sub.f32 	%f356, %f355, %f352;
mul.f32 	%f357, %f351, %f356;
ld.param.u64 	%rd294, [__cudaparm__Z23update_kernelPiPfmS0_PjS_S0_S0_S0__d_trace_v];
mov.s32 	%r474, %r2;
mov.s32 	%r475, %r4;
mul24.lo.u32 	%r476, %r475, 48;
add.u32 	%r477, %r474, %r476;
cvt.u64.u32 	%rd295, %r477;
mul.lo.u64 	%rd296, %rd295, 4;
add.u64 	%rd297, %rd294, %rd296;
st.global.f32 	[%rd297+0], %f357;[/codebox]

This is really confusing… I hope someone more knowledgeable will update this thread.

Also, it is possible that the way you are casting it has a problem with the values you have used. Check the PTX for more clarity, and see if you can reproduce the behaviour with small float numbers…
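On the casting point: if the pointer casts are a worry, the same reinterpretation can be done on the device with CUDA’s __int_as_float() intrinsic, which sidesteps aliasing questions entirely. A minimal sketch (hypothetical kernel name, same bit patterns as above):

[codebox]__global__ void repro_kernel(float *out)
{
    /* Reinterpret the raw bit patterns as floats without pointer casts.
       The (int) casts of values above INT_MAX rely on the usual
       two's-complement wraparound. */
    float reg0 = __int_as_float((int)0x3BF5C28F);
    float reg1 = __int_as_float((int)0xC1180000);
    float reg2 = __int_as_float((int)0x3F62CB81);
    float reg3 = __int_as_float((int)0xBE4352C3);

    out[0] = reg0 * (reg1 * reg2 - reg3);
}[/codebox]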

As far as I understand it, there is another compilation stage between PTX code and what the GPU actually executes.

If you look up, for example, mul in ptx_isa_1.4.pdf (page 60, PDF page 72), you’ll find a note saying (paraphrasing) that rounding modifiers default to .rn, that an instruction with an explicit rounding modifier is treated conservatively by the code optimizer, and that mul/add sequences with no rounding modifiers may be optimized to use fused multiply-add instructions on the target device.

It seems plausible to me that the optimizer uses a fused multiply-add in the register case, but leaves the operations separate when the operands need to be fetched from shared memory one by one.

For now that’s only a guess; I haven’t verified it.

Obviously, this could be verified by adding an explicit rounding-mode specifier (or perhaps there is even an nvcc flag to control the PTX code generation?).

Thank you. Yes, you are correct: changing mul, add, and div to their round-to-nearest-even versions resulted in a 100% match between CPU and GPU execution for the entire simulation, for all variables.
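For anyone who finds this later, a sketch of what that fix can look like in the source, assuming the __fmul_rn/__fadd_rn device intrinsics (which the Programming Guide documents as never being merged into a multiply-add by the compiler); the subtraction is written as __fadd_rn with a negated operand:

[codebox]/* reg0 * (reg1 * reg2 - reg3), with every step forced to a separate
   round-to-nearest-even operation, so nvcc emits mul.rn.f32 / add.rn.f32
   in the PTX and the optimizer cannot contract the mul/sub pair into a
   truncating MAD. Negation is exact, so -reg3 loses no precision. */
float prod = __fmul_rn(reg1, reg2);
float diff = __fadd_rn(prod, -reg3);   /* reg1 * reg2 - reg3 */
float res  = __fmul_rn(reg0, diff);[/codebox]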

This is just brilliant, guys! Good thread!