Hi All,
I’m trying to figure out how the .rn, .rm, .rp, .rz rounding qualifiers work on floating-point values in PTX. Their documentation is pretty sketchy (see the “PTX: Parallel Thread Execution” PDF, the ADD instruction for example). Why do I care? I’ve written a CUDA emulator to run on Windows, and wanted to figure out how these strange rounding options really work, so I could try to do the same. So, I wrote a small PTX/CUDA driver program that tests the difference between 1 and 1+1/2^n for some positive integer n, using 32-bit floating point registers, and the ADD{.rnd}.F32 and SETP.EQ.F32 instructions. Eventually, for some n, I expect no difference between the two numbers because the mantissa can hold only 23 bits. Otherwise, why even offer these rounding options on instructions?
I ran my program on a GTX 470 GPU. What I expected was to see no difference between 1 and 1+1/2^23 in a 32-bit floating point register. For .rn, .rm, .rz rounding, there was no difference, which is what I expected. (I may be off by 1, so please be kind.) But for .rp, it seems that registers retain a mantissa of 150 bits. While I can understand computing floating-point values with a higher precision than 32 bits within the floating point unit, then truncating the results to 32 bits, I didn’t expect to see that the GPU computes it to a 150-bit mantissa. And, especially to find that 32-bit f.p. registers are not actually 32 bits!! (Seeing is believing – look at the code.)
Am I correct? Is a 32-bit f.p. register really NOT 32 bits?! And is this IEEE 754-2008 floating point conforming? As you can tell, I’m not an IEEE 754-2008 expert, and I don’t have a copy of the spec. And, I don’t understand what the doc means by “Instructions that support rounding modifiers are IEEE-754 compliant. Double-precision instructions support subnormal inputs and results.” Is there any better discussion of floating point calculations on the Fermi?
Ken
Here’s my code (C++, and PTX) and output:
[codebox]#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <iostream>
#include <string>
#include <vector>
#include <cuda.h>
// Reports a failed call. When the status code `x` is non-zero, writes
// "fail <code> <label>" followed by a newline to standard output and
// flushes the stream; a zero status produces no output at all.
//   x - status code to check (0 is treated as success)
//   s - label printed after the code, e.g. the name of the call checked
void test(int x, const char * s)
{
    const int status = x;
    if (status == 0)
        return;

    std::cout << "fail " << status << " " << s << "\n";
    std::cout.flush();
}
// Rounds `offset` up, in place, to the next multiple of `alignment`.
// The mask arithmetic is only meaningful when `alignment` is a power of two.
// `offset` must be an assignable lvalue; both arguments are evaluated more
// than once, so avoid arguments with side effects.
#define ALIGN_UP(offset, alignment) \
    (offset) = (((offset) + (alignment) - 1) & ~((alignment) - 1))
void load_and_test_float(CUmodule cuModule, char * test_name)
{
try
{
CUfunction proc;
test(cuModuleGetFunction(&proc, cuModule, test_name), "cuModuleGetFunction");
int max = 1000;
float * h_R = (float*)malloc(max * sizeof(float));
memset(h_R, 0, max * sizeof(float));
CUdeviceptr d_R;
test(cuMemAlloc(&d_R, max * sizeof(float)), "cuMemAlloc");
test(cuMemcpyHtoD(d_R, h_R, sizeof(int)), "cuMemcpyHtoD");
CUdeviceptr d_N;
int h_N = 0;
test(cuMemAlloc(&d_N, sizeof(int)), "cuMemAlloc");
test(cuMemcpyHtoD(d_N, &h_N, sizeof(int)), "cuMemcpyHtoD");
int offset = 0;
void* ptr;
ptr = (void*)(size_t)d_R;
ALIGN_UP(offset, __alignof(ptr));
test(cuParamSetv(proc, offset, &ptr, sizeof(ptr)), "cuParamSetv");
offset += sizeof(ptr);
ptr = (void*)(size_t)d_N;
ALIGN_UP(offset, __alignof(ptr));
test(cuParamSetv(proc, offset, &ptr, sizeof(ptr)), "cuParamSetv");
offset += sizeof(ptr);
test(cuParamSetSize(proc, offset), "cuParamSetSize");
int threadsPerBlock = 1;
int blocksPerGrid = 1;
test(cuFuncSetBlockShape(proc, threadsPerBlock, 1, 1), "cuFuncSetBlockShape");
test(cuLaunchGrid(proc, blocksPerGrid, 1), "cuLaunchGrid");
test(cuMemcpyDtoH(h_R, d_R, max * sizeof(float)), "cuMemcpyDtoH");
test(cuMemcpyDtoH(&h_N, d_N, sizeof(int)), "cuMemcpyDtoH");
test(cuMemFree(d_R), "cuMemFree");
test(cuMemFree(d_N), "cuMemFree");
union FOO
{
float f;
int i;
} foo;
for (int i = 0; i < h_N; ++i)
{
foo.f = h_R[i];
printf("%d %f %x\n", i, h_R[i], foo.i);
}
}
catch (...)
{
std::string s = test_name;
s = s.append(" crashed.\n");
test(1, s.c_str());
}
}
int main(int argc, char *argv)
{
argc--; argv++;
test(cuInit(0), "cuInit");
int deviceCount = 0;
test(cuDeviceGetCount(&deviceCount), "cuDeviceGetCount");
int device = 0;
if (argc)
device = atoi(*argv);
CUdevice cuDevice = 0;
test(cuDeviceGet(&cuDevice, device), "cuDeviceGet");
CUcontext cuContext;
int xxx = cuCtxCreate(&cuContext, 0, cuDevice);
CUmodule cuModule;
test(cuModuleLoad(&cuModule, "inst.ptx"), "cuModuleLoad");
load_and_test_float(cuModule, "TestRP");
load_and_test_float(cuModule, "TestRZ");
load_and_test_float(cuModule, "TestRN");
load_and_test_float(cuModule, "TestRM");
return 0;
}
[/codebox]
[codebox] .version 2.1
.target sm_20
// .version 1.4
// .target sm_13
// TestRP: probes add.rp.f32 (round toward +infinity).
// A step value starts at 1.0 and is halved on every pass; each pass stores
// 1.0 + step to the next 32-bit slot of the result buffer and repeats while
// that sum still compares unequal to 1.0. The number of stores is written
// through __N on exit.
.entry TestRP (
    .param .u32 __results,  // Float*
    .param .u32 __N         // N*
)
{
    .reg .u32   %dst, %count, %count_addr;
    .reg .f32   %one, %step, %sum;
    .reg .pred  %changed;

    // Result cursor and store counter.
    ld.param.u32    %dst, [__results];
    mov.u32         %count, 0;

    // rp
    mov.f32         %step, 1.0;
loop_rp:
    mov.f32         %one, 1.0;
    div.full.f32    %step, %step, 2.0;      // step = step / 2
    add.rp.f32      %sum, %one, %step;      // sum = 1.0 + step
    st.global.f32   [%dst], %sum;
    add.u32         %count, %count, 1;
    add.u32         %dst, %dst, 4;          // advance by one 4-byte slot
    setp.ne.f32     %changed, %sum, %one;
    @%changed bra   loop_rp;

done_rp:
    // Pass back the number of tests performed.
    ld.param.u32    %count_addr, [__N];
    st.global.u32   [%count_addr], %count;
    exit;
}
// TestRZ: probes add.rz.f32 (round toward zero).
// A step value starts at 1.0 and is halved on every pass; each pass stores
// 1.0 + step to the next 32-bit slot of the result buffer and repeats while
// that sum still compares unequal to 1.0. The number of stores is written
// through __N on exit.
.entry TestRZ (
    .param .u32 __results,  // Float*
    .param .u32 __N         // N*
)
{
    .reg .u32   %dst, %count, %count_addr;
    .reg .f32   %one, %step, %sum;
    .reg .pred  %changed;

    // Result cursor and store counter.
    ld.param.u32    %dst, [__results];
    mov.u32         %count, 0;

    // rz
    mov.f32         %step, 1.0;
loop_rz:
    mov.f32         %one, 1.0;
    div.full.f32    %step, %step, 2.0;      // step = step / 2
    add.rz.f32      %sum, %one, %step;      // sum = 1.0 + step
    st.global.f32   [%dst], %sum;
    add.u32         %count, %count, 1;
    add.u32         %dst, %dst, 4;          // advance by one 4-byte slot
    setp.ne.f32     %changed, %sum, %one;
    @%changed bra   loop_rz;

done_rz:
    // Pass back the number of tests performed.
    ld.param.u32    %count_addr, [__N];
    st.global.u32   [%count_addr], %count;
    exit;
}
// TestRN: probes add.rn.f32 (round to nearest even).
// %f32_1 starts at 1.0 and is halved on every pass; each pass stores
// 1.0 + %f32_1 to the next 32-bit slot of the result buffer and repeats while
// that sum still compares unequal to 1.0. The number of stores is written
// through __N on exit.
.entry TestRN (
    .param .u32 __results, // Float*
    .param .u32 __N // N*
)
{
    .reg .u32 %r<5>;
    .reg .f32 %f32_<5>;
    .reg .pred %p<3>;
    // Set up: %r0 = result cursor, %r2 = number of values stored.
    ld.param.u32 %r0,[__results];
    mov.u32 %r2, 0;
    // rn
    mov.f32 %f32_1, 1.0;
again_rn:
    mov.f32 %f32_0, 1.0;
    div.full.f32 %f32_1, %f32_1, 2.0;       // halve the addend
    add.rn.f32 %f32_3, %f32_0, %f32_1;      // 1.0 + addend
    st.global.f32 [%r0], %f32_3;
    add.u32 %r2, %r2, 1;
    add.u32 %r0, %r0, 4;                    // advance by one 4-byte slot
    setp.ne.f32 %p0, %f32_3, %f32_0;
    @%p0 bra again_rn;
end:
    // Pass back the number of tests performed.
    ld.param.u32 %r1,[__N];
    st.global.u32 [%r1], %r2;
    exit;
}
// TestRM: probes add.rm.f32 (round toward -infinity) in two phases.
//
// Phase 1: %f32_1 starts at 1.0 and is halved on every pass; each pass stores
// 1.0 + %f32_1 and repeats while that sum still compares unequal to 1.0.
//
// Phase 2: the addend is reset to 1.0 and the same halve/add/store step is
// repeated until the TOTAL store count in %r2 reaches 130. Then the final
// addend is subtracted from the final sum with sub.rp.f32 and one marker is
// stored: 101.0 if the difference compares equal to 1.0, 99.0 otherwise.
//
// The total number of stores (including the marker) is written through __N.
.entry TestRM (
    .param .u32 __results, // Float*
    .param .u32 __N // N*
)
{
    .reg .u32 %r<5>;
    .reg .f32 %f32_<5>;
    .reg .pred %p<3>;
    // Set up: %r0 = result cursor, %r2 = number of values stored.
    ld.param.u32 %r0,[__results];
    mov.u32 %r2, 0;
    // rm
    mov.f32 %f32_1, 1.0;
again_rm:
    mov.f32 %f32_0, 1.0;
    div.full.f32 %f32_1, %f32_1, 2.0;       // halve the addend
    add.rm.f32 %f32_3, %f32_0, %f32_1;      // 1.0 + addend
    st.global.f32 [%r0], %f32_3;
    add.u32 %r2, %r2, 1;
    add.u32 %r0, %r0, 4;                    // advance by one 4-byte slot
    setp.ne.f32 %p0, %f32_3, %f32_0;
    @%p0 bra again_rm;
    // see if it retains precision in a register, and I can subtract it.
    mov.f32 %f32_1, 1.0;
    // %r2 is NOT reset here, so 130 bounds the total number of stores made
    // by both loops; this loop performs (130 - stores so far) halvings,
    // not 130 of them.
    mov.u32 %r3, 130;
again:
    mov.f32 %f32_0, 1.0;
    div.full.f32 %f32_1, %f32_1, 2.0;
    add.rm.f32 %f32_3, %f32_0, %f32_1;
    st.global.f32 [%r0], %f32_3;
    add.u32 %r2, %r2, 1;
    add.u32 %r0, %r0, 4;
    setp.ne.u32 %p0, %r3, %r2;
    @%p0 bra again;
    // Subtract the last addend from the last sum and compare with 1.0.
    sub.rp.f32 %f32_3, %f32_3, %f32_1;
    setp.eq.f32 %p0, %f32_3, %f32_0;
    @%p0 bra equal;
    st.global.f32 [%r0], 99.0;              // marker: difference != 1.0
    add.u32 %r2, %r2, 1;
    add.u32 %r0, %r0, 4;
    bra end;
equal:
    st.global.f32 [%r0], 101.0;             // marker: difference == 1.0
    add.u32 %r2, %r2, 1;
    add.u32 %r0, %r0, 4;
end:
    // Pass back the number of tests performed.
    ld.param.u32 %r1,[__N];
    st.global.u32 [%r1], %r2;
    exit;
}
[/codebox]
Output:
[codebox]# ./ptx-inst.exe
0 1.500000 3fc00000
1 1.250000 3fa00000
2 1.125000 3f900000
3 1.062500 3f880000
4 1.031250 3f840000
5 1.015625 3f820000
6 1.007813 3f810000
7 1.003906 3f808000
8 1.001953 3f804000
9 1.000977 3f802000
10 1.000488 3f801000
11 1.000244 3f800800
12 1.000122 3f800400
13 1.000061 3f800200
14 1.000031 3f800100
15 1.000015 3f800080
16 1.000008 3f800040
17 1.000004 3f800020
18 1.000002 3f800010
19 1.000001 3f800008
20 1.000000 3f800004
21 1.000000 3f800002
22 1.000000 3f800001
23 1.000000 3f800001
24 1.000000 3f800001
25 1.000000 3f800001
26 1.000000 3f800001
27 1.000000 3f800001
28 1.000000 3f800001
29 1.000000 3f800001
30 1.000000 3f800001
31 1.000000 3f800001
32 1.000000 3f800001
33 1.000000 3f800001
34 1.000000 3f800001
35 1.000000 3f800001
36 1.000000 3f800001
37 1.000000 3f800001
38 1.000000 3f800001
39 1.000000 3f800001
40 1.000000 3f800001
41 1.000000 3f800001
42 1.000000 3f800001
43 1.000000 3f800001
44 1.000000 3f800001
45 1.000000 3f800001
46 1.000000 3f800001
47 1.000000 3f800001
48 1.000000 3f800001
49 1.000000 3f800001
50 1.000000 3f800001
51 1.000000 3f800001
52 1.000000 3f800001
53 1.000000 3f800001
54 1.000000 3f800001
55 1.000000 3f800001
56 1.000000 3f800001
57 1.000000 3f800001
58 1.000000 3f800001
59 1.000000 3f800001
60 1.000000 3f800001
61 1.000000 3f800001
62 1.000000 3f800001
63 1.000000 3f800001
64 1.000000 3f800001
65 1.000000 3f800001
66 1.000000 3f800001
67 1.000000 3f800001
68 1.000000 3f800001
69 1.000000 3f800001
70 1.000000 3f800001
71 1.000000 3f800001
72 1.000000 3f800001
73 1.000000 3f800001
74 1.000000 3f800001
75 1.000000 3f800001
76 1.000000 3f800001
77 1.000000 3f800001
78 1.000000 3f800001
79 1.000000 3f800001
80 1.000000 3f800001
81 1.000000 3f800001
82 1.000000 3f800001
83 1.000000 3f800001
84 1.000000 3f800001
85 1.000000 3f800001
86 1.000000 3f800001
87 1.000000 3f800001
88 1.000000 3f800001
89 1.000000 3f800001
90 1.000000 3f800001
91 1.000000 3f800001
92 1.000000 3f800001
93 1.000000 3f800001
94 1.000000 3f800001
95 1.000000 3f800001
96 1.000000 3f800001
97 1.000000 3f800001
98 1.000000 3f800001
99 1.000000 3f800001
100 1.000000 3f800001
101 1.000000 3f800001
102 1.000000 3f800001
103 1.000000 3f800001
104 1.000000 3f800001
105 1.000000 3f800001
106 1.000000 3f800001
107 1.000000 3f800001
108 1.000000 3f800001
109 1.000000 3f800001
110 1.000000 3f800001
111 1.000000 3f800001
112 1.000000 3f800001
113 1.000000 3f800001
114 1.000000 3f800001
115 1.000000 3f800001
116 1.000000 3f800001
117 1.000000 3f800001
118 1.000000 3f800001
119 1.000000 3f800001
120 1.000000 3f800001
121 1.000000 3f800001
122 1.000000 3f800001
123 1.000000 3f800001
124 1.000000 3f800001
125 1.000000 3f800001
126 1.000000 3f800001
127 1.000000 3f800001
128 1.000000 3f800001
129 1.000000 3f800001
130 1.000000 3f800001
131 1.000000 3f800001
132 1.000000 3f800001
133 1.000000 3f800001
134 1.000000 3f800001
135 1.000000 3f800001
136 1.000000 3f800001
137 1.000000 3f800001
138 1.000000 3f800001
139 1.000000 3f800001
140 1.000000 3f800001
141 1.000000 3f800001
142 1.000000 3f800001
143 1.000000 3f800001
144 1.000000 3f800001
145 1.000000 3f800001
146 1.000000 3f800001
147 1.000000 3f800001
148 1.000000 3f800001
149 1.000000 3f800000
0 1.500000 3fc00000
1 1.250000 3fa00000
2 1.125000 3f900000
3 1.062500 3f880000
4 1.031250 3f840000
5 1.015625 3f820000
6 1.007813 3f810000
7 1.003906 3f808000
8 1.001953 3f804000
9 1.000977 3f802000
10 1.000488 3f801000
11 1.000244 3f800800
12 1.000122 3f800400
13 1.000061 3f800200
14 1.000031 3f800100
15 1.000015 3f800080
16 1.000008 3f800040
17 1.000004 3f800020
18 1.000002 3f800010
19 1.000001 3f800008
20 1.000000 3f800004
21 1.000000 3f800002
22 1.000000 3f800001
23 1.000000 3f800000
0 1.500000 3fc00000
1 1.250000 3fa00000
2 1.125000 3f900000
3 1.062500 3f880000
4 1.031250 3f840000
5 1.015625 3f820000
6 1.007813 3f810000
7 1.003906 3f808000
8 1.001953 3f804000
9 1.000977 3f802000
10 1.000488 3f801000
11 1.000244 3f800800
12 1.000122 3f800400
13 1.000061 3f800200
14 1.000031 3f800100
15 1.000015 3f800080
16 1.000008 3f800040
17 1.000004 3f800020
18 1.000002 3f800010
19 1.000001 3f800008
20 1.000000 3f800004
21 1.000000 3f800002
22 1.000000 3f800001
23 1.000000 3f800000
0 1.500000 3fc00000
1 1.250000 3fa00000
2 1.125000 3f900000
3 1.062500 3f880000
4 1.031250 3f840000
5 1.015625 3f820000
6 1.007813 3f810000
7 1.003906 3f808000
8 1.001953 3f804000
9 1.000977 3f802000
10 1.000488 3f801000
11 1.000244 3f800800
12 1.000122 3f800400
13 1.000061 3f800200
14 1.000031 3f800100
15 1.000015 3f800080
16 1.000008 3f800040
17 1.000004 3f800020
18 1.000002 3f800010
19 1.000001 3f800008
20 1.000000 3f800004
21 1.000000 3f800002
22 1.000000 3f800001
23 1.000000 3f800000
24 1.500000 3fc00000
25 1.250000 3fa00000
26 1.125000 3f900000
27 1.062500 3f880000
28 1.031250 3f840000
29 1.015625 3f820000
30 1.007813 3f810000
31 1.003906 3f808000
32 1.001953 3f804000
33 1.000977 3f802000
34 1.000488 3f801000
35 1.000244 3f800800
36 1.000122 3f800400
37 1.000061 3f800200
38 1.000031 3f800100
39 1.000015 3f800080
40 1.000008 3f800040
41 1.000004 3f800020
42 1.000002 3f800010
43 1.000001 3f800008
44 1.000000 3f800004
45 1.000000 3f800002
46 1.000000 3f800001
47 1.000000 3f800000
48 1.000000 3f800000
49 1.000000 3f800000
50 1.000000 3f800000
51 1.000000 3f800000
52 1.000000 3f800000
53 1.000000 3f800000
54 1.000000 3f800000
55 1.000000 3f800000
56 1.000000 3f800000
57 1.000000 3f800000
58 1.000000 3f800000
59 1.000000 3f800000
60 1.000000 3f800000
61 1.000000 3f800000
62 1.000000 3f800000
63 1.000000 3f800000
64 1.000000 3f800000
65 1.000000 3f800000
66 1.000000 3f800000
67 1.000000 3f800000
68 1.000000 3f800000
69 1.000000 3f800000
70 1.000000 3f800000
71 1.000000 3f800000
72 1.000000 3f800000
73 1.000000 3f800000
74 1.000000 3f800000
75 1.000000 3f800000
76 1.000000 3f800000
77 1.000000 3f800000
78 1.000000 3f800000
79 1.000000 3f800000
80 1.000000 3f800000
81 1.000000 3f800000
82 1.000000 3f800000
83 1.000000 3f800000
84 1.000000 3f800000
85 1.000000 3f800000
86 1.000000 3f800000
87 1.000000 3f800000
88 1.000000 3f800000
89 1.000000 3f800000
90 1.000000 3f800000
91 1.000000 3f800000
92 1.000000 3f800000
93 1.000000 3f800000
94 1.000000 3f800000
95 1.000000 3f800000
96 1.000000 3f800000
97 1.000000 3f800000
98 1.000000 3f800000
99 1.000000 3f800000
100 1.000000 3f800000
101 1.000000 3f800000
102 1.000000 3f800000
103 1.000000 3f800000
104 1.000000 3f800000
105 1.000000 3f800000
106 1.000000 3f800000
107 1.000000 3f800000
108 1.000000 3f800000
109 1.000000 3f800000
110 1.000000 3f800000
111 1.000000 3f800000
112 1.000000 3f800000
113 1.000000 3f800000
114 1.000000 3f800000
115 1.000000 3f800000
116 1.000000 3f800000
117 1.000000 3f800000
118 1.000000 3f800000
119 1.000000 3f800000
120 1.000000 3f800000
121 1.000000 3f800000
122 1.000000 3f800000
123 1.000000 3f800000
124 1.000000 3f800000
125 1.000000 3f800000
126 1.000000 3f800000
127 1.000000 3f800000
128 1.000000 3f800000
129 1.000000 3f800000
130 101.000000 42ca0000
[/codebox]