Precision and rounding in floating-point registers.

Hi All,

I’m trying to figure out how the .rn, .rm, .rp, .rz rounding qualifiers work on floating-point operations in PTX. The documentation for them is pretty sketchy (see the “PTX: Parallel Thread Execution” PDF, the ADD instruction for example). Why do I care? I’ve written a CUDA emulator to run on Windows and wanted to figure out how these rounding options really work, so I could do the same. So I wrote a small PTX/CUDA driver program that tests the difference between 1 and 1+1/2^n for positive integers n, using 32-bit floating-point registers and the ADD{.rnd}.F32 and SETP.EQ.F32 instructions. Eventually, for some n, I expect no difference between the two numbers because the mantissa can hold only 23 bits. Otherwise, why even offer these rounding options on instructions?

I ran my program on a GT 470 GPU. What I expected was to see no difference between 1 and 1+1/2^23 in a 32-bit floating-point register. For .rn, .rm, and .rz rounding there was no difference, which is what I expected. (I may be off by 1, so please be kind.) But for .rp, it seems that registers retain a mantissa of about 150 bits. While I can understand computing floating-point results with higher precision than 32 bits within the floating-point unit, then truncating the results to 32 bits, I didn’t expect to see the GPU compute with a 150-bit mantissa, and especially not to find that 32-bit f.p. registers are not actually 32 bits! (Seeing is believing; look at the code.)

Am I correct? Is a 32-bit f.p. register really NOT 32 bits?! And is this conforming to IEEE 754-2008 floating point? As you can tell, I’m not an IEEE 754-2008 expert, and I don’t have a copy of the spec. I also don’t understand what the doc means by “Instructions that support rounding modifiers are IEEE-754 compliant. Double-precision instructions support subnormal inputs and results.” Is there any better discussion of floating-point calculations on Fermi?
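(Not part of the test program below, but as a quick host-side sanity check of the 23-bit mantissa expectation, a small sketch using nextafterf shows the bit pattern of the next representable single-precision value above 1.0:)

[codebox]#include <stdio.h>
#include <math.h>

int main(void)
{
    union { float f; unsigned u; } v;
    v.f = nextafterf(1.0f, 2.0f);      /* smallest float greater than 1.0f, i.e. 1 + 2^-23 */
    printf("%.9e %08x\n", v.f, v.u);   /* expected: 1.000000119e+00 3f800001 */
    return 0;
}
[/codebox]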

Ken

Here’s my code (C++ and PTX) and output:

[codebox]#include <stdio.h>

#include <iostream>

#include <string>

#include <cstring>

#include <cstdlib>

#include <cuda.h>

void test(int x, const char * s)

{

int t = x;

if (t != 0)

{

	std::cout << "fail " << t << " " << s << "\n";

	std::cout.flush();

}

}

#define ALIGN_UP(offset, alignment) (offset) = ((offset) + (alignment) - 1) & ~((alignment) - 1)

void load_and_test_float(CUmodule cuModule, const char * test_name)

{

try

{

	CUfunction proc;

	test(cuModuleGetFunction(&proc, cuModule, test_name), "cuModuleGetFunction");

	int max = 1000;

	float * h_R = (float*)malloc(max * sizeof(float));

	memset(h_R, 0, max * sizeof(float));

	CUdeviceptr d_R;

	test(cuMemAlloc(&d_R, max * sizeof(float)), "cuMemAlloc");

	test(cuMemcpyHtoD(d_R, h_R, sizeof(int)), "cuMemcpyHtoD");

	CUdeviceptr d_N;

	int h_N = 0;

	test(cuMemAlloc(&d_N, sizeof(int)), "cuMemAlloc");

	test(cuMemcpyHtoD(d_N, &h_N, sizeof(int)), "cuMemcpyHtoD");

	int offset = 0;

	void* ptr;

	

	ptr = (void*)(size_t)d_R;

	ALIGN_UP(offset, __alignof(ptr));

	test(cuParamSetv(proc, offset, &ptr, sizeof(ptr)), "cuParamSetv");

	offset += sizeof(ptr);

	

	ptr = (void*)(size_t)d_N;

	ALIGN_UP(offset, __alignof(ptr));

	test(cuParamSetv(proc, offset, &ptr, sizeof(ptr)), "cuParamSetv");

	offset += sizeof(ptr);

	

	test(cuParamSetSize(proc, offset), "cuParamSetSize");

	int threadsPerBlock = 1;

	int blocksPerGrid = 1;

	test(cuFuncSetBlockShape(proc, threadsPerBlock, 1, 1), "cuFuncSetBlockShape");

	test(cuLaunchGrid(proc, blocksPerGrid, 1), "cuLaunchGrid");

	test(cuMemcpyDtoH(h_R, d_R, max * sizeof(float)), "cuMemcpyDtoH");

	test(cuMemcpyDtoH(&h_N, d_N, sizeof(int)), "cuMemcpyDtoH");

	test(cuMemFree(d_R), "cuMemFree");

	test(cuMemFree(d_N), "cuMemFree");

	union FOO

	{

		float f;

		int i;

	} foo;

	for (int i = 0; i < h_N; ++i)

	{

		foo.f = h_R[i];

		printf("%d %f %x\n", i, h_R[i], foo.i);

	}

}

catch (...)

{

	std::string s = test_name;

	s = s.append(" crashed.\n");

	test(1, s.c_str());

}

}

int main(int argc, char **argv)

{

argc--; argv++;

test(cuInit(0), "cuInit");

int deviceCount = 0;

test(cuDeviceGetCount(&deviceCount), "cuDeviceGetCount");

int device = 0;

if (argc)

	device = atoi(*argv);

CUdevice cuDevice = 0;

test(cuDeviceGet(&cuDevice, device), "cuDeviceGet");

CUcontext cuContext;

int xxx = cuCtxCreate(&cuContext, 0, cuDevice);

CUmodule cuModule;

test(cuModuleLoad(&cuModule, "inst.ptx"), "cuModuleLoad");

load_and_test_float(cuModule, "TestRP");

load_and_test_float(cuModule, "TestRZ");

load_and_test_float(cuModule, "TestRN");

load_and_test_float(cuModule, "TestRM");

return 0;

}

[/codebox]

[codebox] .version 2.1

.target sm_20

// .version 1.4

// .target sm_13

.entry TestRP (

	.param .u32 __results,	// Float*

	.param .u32 __N		// N*

)

{

.reg .u32 %r<5>;

.reg .f32 %f32_<5>;

.reg .pred %p<3>;

// Set up

ld.param.u32	%r0,[__results];

mov.u32		%r2, 0;

// rp

mov.f32		%f32_1, 1.0;

again_rp:

mov.f32		%f32_0, 1.0;

div.full.f32	%f32_1, %f32_1, 2.0;

add.rp.f32	%f32_3, %f32_0, %f32_1;

st.global.f32	[%r0], %f32_3;

add.u32		%r2, %r2, 1;

add.u32		%r0, %r0, 4;

setp.ne.f32	%p0, %f32_3, %f32_0;

@%p0 bra again_rp;

end:

// Pass back the number of tests performed.

ld.param.u32	%r1,[__N];

st.global.u32	[%r1], %r2;

exit;

}

.entry TestRZ (

	.param .u32 __results,	// Float*

	.param .u32 __N		// N*

)

{

.reg .u32 %r<5>;

.reg .f32 %f32_<5>;

.reg .pred %p<3>;

// Set up

ld.param.u32	%r0,[__results];

mov.u32		%r2, 0;

// rz

mov.f32		%f32_1, 1.0;

again_rz:

mov.f32		%f32_0, 1.0;

div.full.f32	%f32_1, %f32_1, 2.0;

add.rz.f32	%f32_3, %f32_0, %f32_1;

st.global.f32	[%r0], %f32_3;

add.u32		%r2, %r2, 1;

add.u32		%r0, %r0, 4;

setp.ne.f32	%p0, %f32_3, %f32_0;

@%p0 bra again_rz;

end:

// Pass back the number of tests performed.

ld.param.u32	%r1,[__N];

st.global.u32	[%r1], %r2;

exit;

}

.entry TestRN (

	.param .u32 __results,	// Float*

	.param .u32 __N		// N*

)

{

.reg .u32 %r<5>;

.reg .f32 %f32_<5>;

.reg .pred %p<3>;

// Set up

ld.param.u32	%r0,[__results];

mov.u32		%r2, 0;

// rn

mov.f32		%f32_1, 1.0;

again_rn:

mov.f32		%f32_0, 1.0;

div.full.f32	%f32_1, %f32_1, 2.0;

add.rn.f32	%f32_3, %f32_0, %f32_1;

st.global.f32	[%r0], %f32_3;

add.u32		%r2, %r2, 1;

add.u32		%r0, %r0, 4;

setp.ne.f32	%p0, %f32_3, %f32_0;

@%p0 bra again_rn;

end:

// Pass back the number of tests performed.

ld.param.u32	%r1,[__N];

st.global.u32	[%r1], %r2;

exit;

}

.entry TestRM (

	.param .u32 __results,	// Float*

	.param .u32 __N		// N*

)

{

.reg .u32 %r<5>;

.reg .f32 %f32_<5>;

.reg .pred %p<3>;

// Set up

ld.param.u32	%r0,[__results];

mov.u32		%r2, 0;

// rm

mov.f32		%f32_1, 1.0;

again_rm:

mov.f32		%f32_0, 1.0;

div.full.f32	%f32_1, %f32_1, 2.0;

add.rm.f32	%f32_3, %f32_0, %f32_1;

st.global.f32	[%r0], %f32_3;

add.u32		%r2, %r2, 1;

add.u32		%r0, %r0, 4;

setp.ne.f32	%p0, %f32_3, %f32_0;

@%p0 bra again_rm;

// see if it retains precision in a register, and I can subtract it.

mov.f32		%f32_1, 1.0;

mov.u32		%r3, 130;		// Only go to 130 binary places.

again:

mov.f32		%f32_0, 1.0;

div.full.f32	%f32_1, %f32_1, 2.0;

add.rm.f32	%f32_3, %f32_0, %f32_1;

st.global.f32	[%r0], %f32_3;

add.u32		%r2, %r2, 1;

add.u32		%r0, %r0, 4;

setp.ne.u32	%p0, %r3, %r2;

@%p0 bra again;

sub.rp.f32	%f32_3, %f32_3, %f32_1;

setp.eq.f32	%p0, %f32_3, %f32_0;

@%p0 bra equal;

st.global.f32	[%r0], 99.0;

add.u32		%r2, %r2, 1;

add.u32		%r0, %r0, 4;

bra		end;

equal:

st.global.f32	[%r0], 101.0;

add.u32		%r2, %r2, 1;

add.u32		%r0, %r0, 4;

end:

// Pass back the number of tests performed.

ld.param.u32	%r1,[__N];

st.global.u32	[%r1], %r2;

exit;

}

[/codebox]

Output:

[codebox]# ./ptx-inst.exe

0 1.500000 3fc00000

1 1.250000 3fa00000

2 1.125000 3f900000

3 1.062500 3f880000

4 1.031250 3f840000

5 1.015625 3f820000

6 1.007813 3f810000

7 1.003906 3f808000

8 1.001953 3f804000

9 1.000977 3f802000

10 1.000488 3f801000

11 1.000244 3f800800

12 1.000122 3f800400

13 1.000061 3f800200

14 1.000031 3f800100

15 1.000015 3f800080

16 1.000008 3f800040

17 1.000004 3f800020

18 1.000002 3f800010

19 1.000001 3f800008

20 1.000000 3f800004

21 1.000000 3f800002

22 1.000000 3f800001

23 1.000000 3f800001

24 1.000000 3f800001

25 1.000000 3f800001

26 1.000000 3f800001

27 1.000000 3f800001

28 1.000000 3f800001

29 1.000000 3f800001

30 1.000000 3f800001

31 1.000000 3f800001

32 1.000000 3f800001

33 1.000000 3f800001

34 1.000000 3f800001

35 1.000000 3f800001

36 1.000000 3f800001

37 1.000000 3f800001

38 1.000000 3f800001

39 1.000000 3f800001

40 1.000000 3f800001

41 1.000000 3f800001

42 1.000000 3f800001

43 1.000000 3f800001

44 1.000000 3f800001

45 1.000000 3f800001

46 1.000000 3f800001

47 1.000000 3f800001

48 1.000000 3f800001

49 1.000000 3f800001

50 1.000000 3f800001

51 1.000000 3f800001

52 1.000000 3f800001

53 1.000000 3f800001

54 1.000000 3f800001

55 1.000000 3f800001

56 1.000000 3f800001

57 1.000000 3f800001

58 1.000000 3f800001

59 1.000000 3f800001

60 1.000000 3f800001

61 1.000000 3f800001

62 1.000000 3f800001

63 1.000000 3f800001

64 1.000000 3f800001

65 1.000000 3f800001

66 1.000000 3f800001

67 1.000000 3f800001

68 1.000000 3f800001

69 1.000000 3f800001

70 1.000000 3f800001

71 1.000000 3f800001

72 1.000000 3f800001

73 1.000000 3f800001

74 1.000000 3f800001

75 1.000000 3f800001

76 1.000000 3f800001

77 1.000000 3f800001

78 1.000000 3f800001

79 1.000000 3f800001

80 1.000000 3f800001

81 1.000000 3f800001

82 1.000000 3f800001

83 1.000000 3f800001

84 1.000000 3f800001

85 1.000000 3f800001

86 1.000000 3f800001

87 1.000000 3f800001

88 1.000000 3f800001

89 1.000000 3f800001

90 1.000000 3f800001

91 1.000000 3f800001

92 1.000000 3f800001

93 1.000000 3f800001

94 1.000000 3f800001

95 1.000000 3f800001

96 1.000000 3f800001

97 1.000000 3f800001

98 1.000000 3f800001

99 1.000000 3f800001

100 1.000000 3f800001

101 1.000000 3f800001

102 1.000000 3f800001

103 1.000000 3f800001

104 1.000000 3f800001

105 1.000000 3f800001

106 1.000000 3f800001

107 1.000000 3f800001

108 1.000000 3f800001

109 1.000000 3f800001

110 1.000000 3f800001

111 1.000000 3f800001

112 1.000000 3f800001

113 1.000000 3f800001

114 1.000000 3f800001

115 1.000000 3f800001

116 1.000000 3f800001

117 1.000000 3f800001

118 1.000000 3f800001

119 1.000000 3f800001

120 1.000000 3f800001

121 1.000000 3f800001

122 1.000000 3f800001

123 1.000000 3f800001

124 1.000000 3f800001

125 1.000000 3f800001

126 1.000000 3f800001

127 1.000000 3f800001

128 1.000000 3f800001

129 1.000000 3f800001

130 1.000000 3f800001

131 1.000000 3f800001

132 1.000000 3f800001

133 1.000000 3f800001

134 1.000000 3f800001

135 1.000000 3f800001

136 1.000000 3f800001

137 1.000000 3f800001

138 1.000000 3f800001

139 1.000000 3f800001

140 1.000000 3f800001

141 1.000000 3f800001

142 1.000000 3f800001

143 1.000000 3f800001

144 1.000000 3f800001

145 1.000000 3f800001

146 1.000000 3f800001

147 1.000000 3f800001

148 1.000000 3f800001

149 1.000000 3f800000

0 1.500000 3fc00000

1 1.250000 3fa00000

2 1.125000 3f900000

3 1.062500 3f880000

4 1.031250 3f840000

5 1.015625 3f820000

6 1.007813 3f810000

7 1.003906 3f808000

8 1.001953 3f804000

9 1.000977 3f802000

10 1.000488 3f801000

11 1.000244 3f800800

12 1.000122 3f800400

13 1.000061 3f800200

14 1.000031 3f800100

15 1.000015 3f800080

16 1.000008 3f800040

17 1.000004 3f800020

18 1.000002 3f800010

19 1.000001 3f800008

20 1.000000 3f800004

21 1.000000 3f800002

22 1.000000 3f800001

23 1.000000 3f800000

0 1.500000 3fc00000

1 1.250000 3fa00000

2 1.125000 3f900000

3 1.062500 3f880000

4 1.031250 3f840000

5 1.015625 3f820000

6 1.007813 3f810000

7 1.003906 3f808000

8 1.001953 3f804000

9 1.000977 3f802000

10 1.000488 3f801000

11 1.000244 3f800800

12 1.000122 3f800400

13 1.000061 3f800200

14 1.000031 3f800100

15 1.000015 3f800080

16 1.000008 3f800040

17 1.000004 3f800020

18 1.000002 3f800010

19 1.000001 3f800008

20 1.000000 3f800004

21 1.000000 3f800002

22 1.000000 3f800001

23 1.000000 3f800000

0 1.500000 3fc00000

1 1.250000 3fa00000

2 1.125000 3f900000

3 1.062500 3f880000

4 1.031250 3f840000

5 1.015625 3f820000

6 1.007813 3f810000

7 1.003906 3f808000

8 1.001953 3f804000

9 1.000977 3f802000

10 1.000488 3f801000

11 1.000244 3f800800

12 1.000122 3f800400

13 1.000061 3f800200

14 1.000031 3f800100

15 1.000015 3f800080

16 1.000008 3f800040

17 1.000004 3f800020

18 1.000002 3f800010

19 1.000001 3f800008

20 1.000000 3f800004

21 1.000000 3f800002

22 1.000000 3f800001

23 1.000000 3f800000

24 1.500000 3fc00000

25 1.250000 3fa00000

26 1.125000 3f900000

27 1.062500 3f880000

28 1.031250 3f840000

29 1.015625 3f820000

30 1.007813 3f810000

31 1.003906 3f808000

32 1.001953 3f804000

33 1.000977 3f802000

34 1.000488 3f801000

35 1.000244 3f800800

36 1.000122 3f800400

37 1.000061 3f800200

38 1.000031 3f800100

39 1.000015 3f800080

40 1.000008 3f800040

41 1.000004 3f800020

42 1.000002 3f800010

43 1.000001 3f800008

44 1.000000 3f800004

45 1.000000 3f800002

46 1.000000 3f800001

47 1.000000 3f800000

48 1.000000 3f800000

49 1.000000 3f800000

50 1.000000 3f800000

51 1.000000 3f800000

52 1.000000 3f800000

53 1.000000 3f800000

54 1.000000 3f800000

55 1.000000 3f800000

56 1.000000 3f800000

57 1.000000 3f800000

58 1.000000 3f800000

59 1.000000 3f800000

60 1.000000 3f800000

61 1.000000 3f800000

62 1.000000 3f800000

63 1.000000 3f800000

64 1.000000 3f800000

65 1.000000 3f800000

66 1.000000 3f800000

67 1.000000 3f800000

68 1.000000 3f800000

69 1.000000 3f800000

70 1.000000 3f800000

71 1.000000 3f800000

72 1.000000 3f800000

73 1.000000 3f800000

74 1.000000 3f800000

75 1.000000 3f800000

76 1.000000 3f800000

77 1.000000 3f800000

78 1.000000 3f800000

79 1.000000 3f800000

80 1.000000 3f800000

81 1.000000 3f800000

82 1.000000 3f800000

83 1.000000 3f800000

84 1.000000 3f800000

85 1.000000 3f800000

86 1.000000 3f800000

87 1.000000 3f800000

88 1.000000 3f800000

89 1.000000 3f800000

90 1.000000 3f800000

91 1.000000 3f800000

92 1.000000 3f800000

93 1.000000 3f800000

94 1.000000 3f800000

95 1.000000 3f800000

96 1.000000 3f800000

97 1.000000 3f800000

98 1.000000 3f800000

99 1.000000 3f800000

100 1.000000 3f800000

101 1.000000 3f800000

102 1.000000 3f800000

103 1.000000 3f800000

104 1.000000 3f800000

105 1.000000 3f800000

106 1.000000 3f800000

107 1.000000 3f800000

108 1.000000 3f800000

109 1.000000 3f800000

110 1.000000 3f800000

111 1.000000 3f800000

112 1.000000 3f800000

113 1.000000 3f800000

114 1.000000 3f800000

115 1.000000 3f800000

116 1.000000 3f800000

117 1.000000 3f800000

118 1.000000 3f800000

119 1.000000 3f800000

120 1.000000 3f800000

121 1.000000 3f800000

122 1.000000 3f800000

123 1.000000 3f800000

124 1.000000 3f800000

125 1.000000 3f800000

126 1.000000 3f800000

127 1.000000 3f800000

128 1.000000 3f800000

129 1.000000 3f800000

130 101.000000 42ca0000

[/codebox]

Rounding modes describe what you want the implementation to do when the exact answer of an operation does not fit inside the destination format.

In your example, when n=24, 1+1/2^24 is:

1.000000000000000000000001

The two floating-point numbers closest to the exact answer are 1 and 1+2^-23:

1.00000000000000000000000

1.00000000000000000000001

Which one would you choose as an approximation? It depends on the rounding mode.

  • Rounding “toward negative” (rm) or “toward zero” (rz) will yield the lower bound 1.

  • Rounding “toward positive” (rp) will yield the upper bound 1+2^-23.

  • For rounding to nearest (rn): here it is a tie (both FP numbers are equally distant from the exact answer). By convention, we select the FP number whose least-significant bit is zero (1.00000000000000000000000).

You eventually reach 1 in RP mode when n=150 because 2^-n cannot be represented as a single-precision floating-point number. Computing it causes an underflow and yields 0 (because your division by 2 uses the default RN mode). So you are actually computing 1+0, which returns 1 as you’d expect.

The behavior of rounding modes on Fermi or Tesla is no different from any IEEE-754-compliant CPU. See for instance fesetround() and fegetround() in the C standard library.
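For example, a minimal host-side sketch along these lines (not from the original post; it assumes the compiler evaluates float arithmetic in single precision, as SSE-based builds do) reproduces the same four behaviors on the CPU:

[codebox]#include <stdio.h>
#include <math.h>
#include <fenv.h>

int main(void)
{
    /* 2^-24 is exactly halfway between 1.0f and the next float above it (1 + 2^-23). */
    volatile float tiny = ldexpf(1.0f, -24);
    volatile float one  = 1.0f;
    const int   modes[4] = { FE_TONEAREST, FE_TOWARDZERO, FE_DOWNWARD, FE_UPWARD };
    const char *names[4] = { "rn", "rz", "rm", "rp" };
    union { float f; unsigned u; } v;

    for (int i = 0; i < 4; i++) {
        fesetround(modes[i]);
        v.f = one + tiny;                 /* rounded according to the current mode */
        printf("%s: %.9e %08x\n", names[i], v.f, v.u);
    }
    fesetround(FE_TONEAREST);             /* restore the default */
    return 0;
}
[/codebox]

Only the rp line should print 3f800001; the other three print 3f800000.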

Hi All,

I’m back on this problem with precision and floats in CUDA. In my previous post, I was computing a value that could not be represented in IEEE 754-2008 32-bit floats. That produced a NaN, which explains the results I saw. Sylvain Collange, thanks for pointing that out.

However, I have now stumbled on an inconsistency in the SETP instruction that I cannot explain, and this time I specifically avoid NaN values. This is very unusual, since the GPU should produce consistent, understandable results.

In my example, I compare two numbers: 1.0 (= 0x3f800000) and a number that is 1.0 plus a very small fraction (= 0x3f800001), where fraction = 1/2^50. Note that fraction is about 8.8e-16 and is representable (not NaN) in 32-bit floats. The number “1.0 plus fraction” is produced two ways. The first way is by computation, 1.0 + 1/2^50 using add.rp.f32. I use TESTP to verify that the result of the addition is normal and not NaN.

[codebox] // First, create a very small number in register %fraction = 1/2^50.

mov.f32		%fraction, 1.0;

mov.u32		%r0, 0;

mov.u32		%r1, 50;

again:

div.full.f32	%fraction, %fraction, 2.0;

add.u32		%r0, %r0, 1;

setp.ne.u32	%p0, %r0, %r1;

@%p0 bra again;

// Add fraction to 1.0, using "rp" rounding.

// Note, 1.0 = 0x3f800000.

// The result should be 0x3f800001.

mov.f32		%one, 1.0;

add.rp.f32	%one_plus_fraction, %one, %fraction;

[/codebox]

The second method is by loading the hexadecimal equivalents of these values into two registers.

[codebox] // Create hardwired 1.0 and 1.0 + 1/2^50.

mov.f32		%hardwire_one, 0f3f800000;

mov.f32		%hardwire_one_plus_fraction, 0f3f800001;

[/codebox]

I first compare the values hardwire_one and hardwire_one_plus_fraction using the SETP instruction.

[codebox] // Compare hardwired values.

setp.ne.f32	%cmp_ne_f32_hardwire, %hardwire_one, %hardwire_one_plus_fraction;

setp.ne.b32	%cmp_ne_b32_hardwire, %hardwire_one, %hardwire_one_plus_fraction;

[/codebox]

The values of the two “%cmp_ne_…32_hardwire” registers are both true. In other words, the two numbers are NOT EQUAL, whether compared using .f32 or .b32. This makes sense: if I set up a similar comparison on the CPU, the two numbers are different, and the bit patterns are different.

Next I compare the two numbers one and one_plus_fraction (the computed value).

[codebox] // Compare using the SETP instruction directly using the f32 registers.

setp.ne.f32	%cmp_ne_f32_directly, %one, %one_plus_fraction;

setp.ne.b32	%cmp_ne_b32_directly, %one, %one_plus_fraction;

[/codebox]

The values of these two “%cmp_ne_…32_directly” comparisons are false! In other words, SETP says the two numbers are THE SAME, whether compared using .f32 or .b32. While I can see that the two numbers are approximately equal given the guard bit (beta = 2, p = 24), the comparison using .b32 does not show that the bit values are different, when they actually should be different.

Now, you may think that I’m off my rocker: the bit values of the two numbers one and one_plus_fraction are actually the same, because that’s what the GPU says. But I passed these 32-bit values back to my calling C program via global memory and verified that the binary values are indeed different: 0x3f800000 and 0x3f800001. In addition, I reloaded these values from global memory into fresh .f32 registers on the GPU, and the SETP instruction now says the values are DIFFERENT! This is inconsistent. Does anyone have an idea why the GPU is behaving this way?

(Attached is the source for this little exercise.)

Ken D.
ptx_inst.zip (6.1 KB)

Confirmed with CUDA 3.2RC (on Linux 64). This looks like a compiler bug (likely in a constant propagation phase of ptxas).

Good catch!

Here is the disassembly with nvc0dis:

--> Disassembling kernel  SetpFloats with nvc0dis

00000000: 2800440400005d04 l3 mov b32 $r1 c1[0x100]

00000008: 18fe00000001dde2 mov b32 $r7 0x3f800000

00000010: 28004000a0015d04 l3 mov b32 $r5 c0[0x28]

00000018: 189a000000009de2 mov b32 $r2 0x26800000

00000020: 18fe000004021d02 ??? b32 $r8 $r0 0x3f800001 [unknown: 1800000000000100]

00000028: 900000000051dc85 st b32 wb g[$r5+0] $r7

00000030: 900000003051dc05 st u8 wb g[$r5+0xc] $r7

00000038: 5100c9a000701c00 add rp f32 $r0 $r7 0x26800000

00000040: 2800400090019d04 l3 mov b32 $r6 c0[0x24]

00000048: 9000000010509c85 st b32 wb g[$r5+0x4] $r2

00000050: 9000000040521c05 st u8 wb g[$r5+0x10] $r8

00000058: 9000000020501c85 st b32 wb g[$r5+0x8] $r0

00000060: 9000000010609c05 st u8 wb g[$r6+0x4] $r2

00000068: 9000000020601c85 st b32 wb g[$r6+0x8] $r0

00000070: 900000000061dc05 st u8 wb g[$r6+0] $r7

00000078: 900000003061dc85 st b32 wb g[$r6+0xc] $r7

00000080: 9000000040621c05 st u8 wb g[$r6+0x10] $r8

00000088: 8000000000511c85 ld b32 $r4 ca g[$r5+0]

00000090: 800000002050dc05 ld u8 $r3 ca g[$r5+0x8]

00000098: 8000000000509c85 ld b32 $r2 ca g[$r5+0]

000000a0: 8000000020501c05 ld u8 $r0 ca g[$r5+0x8]

000000a8: 2800400080019de4 mov b32 $r6 c0[0x20]

000000b0: 180000000401dd02 ??? b32 $r7 $r0 0x1 [unknown: 1800000000000100]

000000b8: 900000000861dc05 st u8 wb g[$r6+0x2] $r7

000000c0: 900000000c61dc05 st u8 wb g[$r6+0x3] $r7

000000c8: 900000001861dc05 st u8 wb g[$r6+0x6] $r7

000000d0: 900000002061dc05 st u8 wb g[$r6+0x8] $r7

000000d8: 90000000006fdc05 st u8 wb g[$r6+0] $r63

000000e0: 90000000046fdc05 st u8 wb g[$r6+0x1] $r63

000000e8: 900000001c6fdc05 st u8 wb g[$r6+0x7] $r63

000000f0: 90000000246fdc05 st u8 wb g[$r6+0x9] $r63

000000f8: 228e00000c41dc00 set $p0 ne f32 $r4 $r3

00000100: 1a8e00000023dc03 set $p1 ne u32 $r2 $r0

00000108: 2010c00007f01c04 selp b32 $r0 $r63 0x1 not $p0

00000110: 2012c00007f09c04 selp b32 $r2 $r63 0x1 not $p1

00000118: 9000000010601c05 st u8 wb g[$r6+0x4] $r0

00000120: 9000000014609c05 st u8 wb g[$r6+0x5] $r2

00000128: 8000000000001de7 exit

Instructions at 000000d8 and 000000e0 store the precomputed value 0 directly at __B[0] and __B[1]. This is wrong; the expected result is 1.

Just to nitpick, it was an underflow which produced a zero as a result of the division, rather than a NaN. NaNs are used for mathematically undefined operations, such as 0×Infinity or square root of a negative number.
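As a small host-side illustration of the distinction (just a sketch; how a NaN is printed varies by platform):

[codebox]#include <stdio.h>
#include <math.h>

int main(void)
{
    /* 2^-150 is below the smallest single-precision subnormal (2^-149),
       so with the default round-to-nearest it underflows to 0. */
    float underflow = ldexpf(1.0f, -150);
    /* Mathematically undefined operations produce NaN instead. */
    float nan1 = 0.0f * INFINITY;
    float nan2 = sqrtf(-1.0f);
    printf("%g  %g  %g\n", underflow, nan1, nan2);
    return 0;
}
[/codebox]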

Sorry, I am a bit lost as to what we are trying to figure out here. The .rn, .rz, .rp, .rm suffixes correspond to the standard four IEEE-754 rounding modes: round-to-nearest-even, round-towards-zero, round-towards-plus-infinity, and round-towards-minus-infinity. On sm_20 hardware, these rounding modes are supported for single-precision adds and multiplies, among other operations. If we add increasingly small, non-zero, positive fractions to 1.0f, we expect the result to eventually become 1.0f (0x3f800000) for .rn, .rz, and .rm, and to become 1.0f + 1 ulp (0x3f800001) for .rp. When I tried the .rp case mentioned above with the program below, things worked just fine. The program prints “result = 1.000000119e+000 (3f800001)” on my C2050 machine. [NOTE: inline PTX is not a supported feature.]

CUDA offers device functions (intrinsics) that give access to basic arithmetic operations as well as various conversions with all four IEEE rounding modes, and I would highly recommend using these device functions instead of using PTX. For single-precision addition the relevant device functions are:

__fadd_rn() // corresponds to add.rn.f32, “n = nearest”

__fadd_rz() // corresponds to add.rz.f32, “z = zero”

__fadd_rd() // corresponds to add.rm.f32, “d = down”

__fadd_ru() // corresponds to add.rp.f32, “u = up”
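For example, a minimal kernel along these lines (a hypothetical sketch, not the program below; __fadd_ru and __fadd_rd require an sm_20 target) should store 0x3f800001:

[codebox]__global__ void fadd_ru_kernel (float *result)
{
    float fraction = ldexpf (1.0f, -50);   // 2^-50, a normal single-precision value
    *result = __fadd_ru (1.0f, fraction);  // round toward +infinity: next float above 1.0f
}
[/codebox]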

[codebox]#include <stdio.h>

#include <stdlib.h>

// Macro to catch CUDA errors in CUDA runtime calls

#define CUDA_SAFE_CALL(call) \

do { \

cudaError_t err = call;                                           \

if (cudaSuccess != err) {                                         \

    fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\

             __FILE__, __LINE__, cudaGetErrorString(err) );       \

    exit(EXIT_FAILURE);                                           \

}                                                                 \

} while (0)

// Macro to catch CUDA errors in kernel launches

#define CHECK_LAUNCH_ERROR() \

do { \

/* Check synchronous errors, i.e. pre-launch */                   \

cudaError_t err = cudaGetLastError();                             \

if (cudaSuccess != err) {                                         \

    fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\

             __FILE__, __LINE__, cudaGetErrorString(err) );       \

    exit(EXIT_FAILURE);                                           \

}                                                                 \

/* Check asynchronous errors, i.e. kernel failed (ULF) */         \

err = cudaThreadSynchronize();                                    \

if (cudaSuccess != err) {                                         \

    fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\

             __FILE__, __LINE__, cudaGetErrorString( err) );      \

    exit(EXIT_FAILURE);                                           \

}                                                                 \

} while (0)

static unsigned floatAsUInt(float x)

{

volatile union {

    float f;

    unsigned i;

} xx;

xx.f = x;

return xx.i;

}

__device__ __forceinline__ float fadd_rp (void)

{

float res;

asm (" .reg .u32 iter, maxiter;\n"

     "        .reg .f32 fraction, one, one_plus_fraction;\n"

     "        .reg .pred noteq;\n"

     "        mov.f32         fraction, 1.0;\n"

     "        mov.u32         iter, 0;\n"

     "        mov.u32         maxiter, 50;\n"

     "$again: div.full.f32	  fraction, fraction, 2.0;\n"

     "        add.u32	  iter, iter, 1;\n"

     "        setp.ne.u32     noteq, iter, maxiter;\n"

     "@noteq  bra             $again;\n"

 "        mov.f32	  one, 1.0;\n"

     "        add.rp.f32	  one_plus_fraction, one, fraction;\n"

     "        mov.f32         %0, one_plus_fraction;\n"

     : "=f"(res) 

     : );

return res;

}

__global__ void fadd_rp_kernel (float *result)

{

*result = fadd_rp();

}

int main (void)

{

float res;

float *res_d;

CUDA_SAFE_CALL (cudaMalloc ((void**)&res_d, sizeof(res_d[0])));

fadd_rp_kernel<<<1,1>>>(res_d);

CHECK_LAUNCH_ERROR();

CUDA_SAFE_CALL (cudaMemcpy (&res, res_d, sizeof(res_d[0]), 

                            cudaMemcpyDeviceToHost));

CUDA_SAFE_CALL (cudaFree (res_d));

printf ("result = %16.9e (%08x)\n", res, floatAsUInt(res));

return EXIT_SUCCESS;

}[/codebox]

It looks like I have a simple repro for the second issue discussed here, and it seems to be a problem inside PTXAS, as Sylvain diagnosed. I will file a compiler bug. If I compile the attached program with -Xptxas -O0 or -Xptxas -O1, it prints “one .ne. one_plus_fraction = 1”, as expected. However, if I compile with -Xptxas -O2 instead it prints “one .ne. one_plus_fraction = 0”. Thanks for bringing this issue to our attention.

[codebox]#include <stdio.h>

#include <stdlib.h>

// Macro to catch CUDA errors in CUDA runtime calls

#define CUDA_SAFE_CALL(call) \

do { \

cudaError_t err = call;                                           \

if (cudaSuccess != err) {                                         \

    fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\

             __FILE__, __LINE__, cudaGetErrorString(err) );       \

    exit(EXIT_FAILURE);                                           \

}                                                                 \

} while (0)

// Macro to catch CUDA errors in kernel launches

#define CHECK_LAUNCH_ERROR() \

do { \

/* Check synchronous errors, i.e. pre-launch */                   \

cudaError_t err = cudaGetLastError();                             \

if (cudaSuccess != err) {                                         \

    fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\

             __FILE__, __LINE__, cudaGetErrorString(err) );       \

    exit(EXIT_FAILURE);                                           \

}                                                                 \

/* Check asynchronous errors, i.e. kernel failed (ULF) */         \

err = cudaThreadSynchronize();                                    \

if (cudaSuccess != err) {                                         \

    fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\

             __FILE__, __LINE__, cudaGetErrorString( err) );      \

    exit(EXIT_FAILURE);                                           \

}                                                                 \

} while (0)

__device__ __forceinline__ int fadd_rp (void)

{

int res;

asm (".reg .u32 iter, maxiter;\n\t"

     ".reg .f32 fraction, one, one_plus_fraction;\n\t"

     ".reg .pred noteq;\n"

     "        mov.f32         fraction, 1.0;\n"

     "        mov.u32         iter, 0;\n"

     "        mov.u32         maxiter, 50;\n"

     "$again: div.full.f32    fraction, fraction, 2.0;\n"

     "        add.u32         iter, iter, 1;\n"

     "        setp.ne.u32     noteq, iter, maxiter;\n"

     "@noteq  bra             $again;\n"

     "        mov.f32         one, 1.0;\n"

     "        add.rp.f32      one_plus_fraction, one, fraction;\n"

     "        setp.ne.f32     noteq, one, one_plus_fraction;\n"

     "        selp.u32        %0, 1, 0, noteq;\n"

     : "=r"(res) 

     : );

return res;

}

__global__ void fadd_rp_kernel (int *result)

{

*result = fadd_rp();

}

int main (void)

{

int res;

int *res_d;

CUDA_SAFE_CALL (cudaMalloc ((void**)&res_d, sizeof(res_d[0])));

fadd_rp_kernel<<<1,1>>>(res_d);

CHECK_LAUNCH_ERROR();

CUDA_SAFE_CALL (cudaMemcpy (&res, res_d, sizeof(res_d[0]), 

                            cudaMemcpyDeviceToHost));

CUDA_SAFE_CALL (cudaFree (res_d));

printf ("one .ne. one_plus_fraction = %d\n", res);

return EXIT_SUCCESS;

}[/codebox]

Sorry for the confusion. I wasn’t sure whether the cause of the problem with SETP was the instruction or the operands, so I placed the follow-up message on this thread. I’m now much clearer on how rounding of floats works in PTX. Thank you all for your explanations.

What is new to me is that PTXAS is not a translator that converts one PTX instruction into one machine instruction, but more like an optimizing compiler. After looking at the output of nvc0dis, it’s amazing to see how much of my PTX example was rewritten; the loop to compute %fraction is gone! Since I’m writing tests for each PTX instruction I implement in my emulator, I’ll be sure to follow up with a peek at the disassembled machine code from now on whenever I see something unusual.

Ken D.
