Alright, so I’ve found another interesting little quirk with my system (CUDA toolkit 3.1) and the output from it. I’ve confirmed my code compiles to PTX containing the same extra MOV instructions regardless of the max register count specified, regardless of which architecture the code is compiled for, and on several different computers. I have also tried switching from unsigned to signed integers, and all that does is change the .u32 to .s32 for the min and max instructions.
I cannot get rid of these extra MOV instructions.
Here is the minimal C code that produces the odd PTX output. Because the compiler fully unrolls the loops, every access to the arrays pixels and buf uses a constant index, which keeps the data in registers — so there is no local memory usage. Each thread sorts 4 pieces of input data using odd-even sort and then stores the sorted data back to global memory.
In theory each swap of the data should only take two instructions because I have two buffers for sorting.
[codebox]__global__ void sort(unsigned int g_pixels[4][256])
{
    // Per-thread working buffers.  Because every loop below is fully unrolled,
    // all indices into these arrays are compile-time constants, so the data
    // lives entirely in registers (no local-memory spill) — the emitted PTX
    // confirms this ("// pixels = 0 / buf = 16" with no ld.local/st.local).
    register unsigned int pixels[4];
    register unsigned int buf[4];

    // Gather: each thread loads its own column of the 4x256 input
    // (rows are 256 * 4 bytes = 1024 bytes apart in the PTX offsets).
#pragma unroll
    for (unsigned int j = 0; j < 4; j++)
    {
        pixels[j] = g_pixels[j][threadIdx.x];
    }

    // Odd-even transposition sort of the 4 values.  Each j-iteration performs
    // one even stage and one odd stage, ping-ponging between buf and pixels so
    // each compare/exchange is just a min + max (two instructions).
    // NOTE(review): n elements need only n stages; 4 iterations here give 8
    // stages, so 2 iterations would presumably suffice — confirm before
    // reducing, since this code is also serving as a compiler repro case.
#pragma unroll
    for (unsigned int j = 0; j < 4; j++)
    {
        // Even stage: compare/exchange pairs (0,1) and (2,3) into buf.
#pragma unroll
        for (unsigned int i = 0; i < 4; i += 2)
        {
            buf[i]     = min(pixels[i], pixels[i + 1]);
            buf[i + 1] = max(pixels[i], pixels[i + 1]);
        }
        // Odd stage: compare/exchange pair (1,2) back into pixels.
#pragma unroll
        for (unsigned int i = 1; i < 3; i += 2)
        {
            pixels[i]     = min(buf[i], buf[i + 1]);
            pixels[i + 1] = max(buf[i], buf[i + 1]);
        }
        // The endpoints are untouched by the odd stage; copy them through.
        pixels[0] = buf[0];
        pixels[3] = buf[3];
    }

    // Scatter: write the sorted column back to global memory.
#pragma unroll
    for (unsigned int j = 0; j < 4; j++)
    {
        g_pixels[j][threadIdx.x] = pixels[j];
    }
}[/codebox]
This is the PTX output from it. You do not need to read all of it, but note that it contains plenty of extra register-to-register MOV instructions that should not be needed.
[codebox] .entry _Z4sortPA256_j (
.param .u64 __cudaparm__Z4sortPA256_j_g_pixels)
{
.reg .u32 %r<90>;
.reg .u64 %rd<6>;
// NOTE(review): PTX declares an effectively unlimited pool of *virtual*
// registers (%r1..%r89 here); the reg-to-reg mov.s32 copies below are
// virtual-register copies, which ptxas presumably coalesces during register
// allocation when it lowers PTX to machine code — verify against the actual
// SASS with a cubin disassembler before concluding they cost cycles.
// pixels = 0
// buf = 16
.loc 28 1 0
$LDWbegin__Z4sortPA256_j:
.loc 28 9 0
// Compute this thread's column pointer: %rd4 = g_pixels + threadIdx.x * 4.
ld.param.u64 %rd1, [__cudaparm__Z4sortPA256_j_g_pixels];
cvt.u64.u32 %rd2, %tid.x;
mov.s32 %r1, %tid.x;
mul.wide.u32 %rd3, %r1, 4;
add.u64 %rd4, %rd1, %rd3;
// Gather: load pixels[0..3]; rows are 256 * 4 = 1024 bytes apart.
ld.global.u32 %r2, [%rd4+0];
mov.s32 %r3, %r2;
ld.global.u32 %r4, [%rd4+1024];
mov.s32 %r5, %r4;
ld.global.u32 %r6, [%rd4+2048];
mov.s32 %r7, %r6;
ld.global.u32 %r8, [%rd4+3072];
mov.s32 %r9, %r8;
// Sort pass 1 (j == 0): even stage (.loc 18/19), odd stage (.loc 25/26),
// endpoint copies (.loc 29/30).
.loc 28 18 0
mov.s32 %r10, %r5;
mov.s32 %r11, %r3;
min.u32 %r12, %r10, %r11;
mov.s32 %r13, %r12;
.loc 28 19 0
max.u32 %r14, %r10, %r11;
mov.s32 %r15, %r14;
.loc 28 18 0
mov.s32 %r16, %r7;
min.u32 %r17, %r8, %r16;
mov.s32 %r18, %r17;
.loc 28 19 0
max.u32 %r19, %r8, %r16;
mov.s32 %r20, %r19;
.loc 28 25 0
mov.s32 %r21, %r18;
mov.s32 %r22, %r15;
min.u32 %r23, %r21, %r22;
mov.s32 %r24, %r23;
.loc 28 26 0
max.u32 %r25, %r21, %r22;
mov.s32 %r26, %r25;
.loc 28 29 0
mov.s32 %r27, %r13;
mov.s32 %r28, %r27;
.loc 28 30 0
mov.s32 %r29, %r19;
// Sort pass 2 (j == 1).
.loc 28 18 0
mov.s32 %r30, %r24;
min.u32 %r31, %r30, %r27;
mov.s32 %r32, %r31;
.loc 28 19 0
max.u32 %r33, %r30, %r27;
mov.s32 %r34, %r33;
.loc 28 18 0
mov.s32 %r35, %r26;
min.u32 %r36, %r19, %r35;
mov.s32 %r37, %r36;
.loc 28 19 0
max.u32 %r38, %r19, %r35;
mov.s32 %r39, %r38;
.loc 28 25 0
mov.s32 %r40, %r37;
mov.s32 %r41, %r34;
min.u32 %r42, %r40, %r41;
mov.s32 %r43, %r42;
.loc 28 26 0
max.u32 %r44, %r40, %r41;
mov.s32 %r45, %r44;
.loc 28 29 0
mov.s32 %r46, %r32;
mov.s32 %r47, %r46;
.loc 28 30 0
mov.s32 %r48, %r38;
// Sort pass 3 (j == 2).
.loc 28 18 0
mov.s32 %r49, %r43;
min.u32 %r50, %r49, %r46;
mov.s32 %r51, %r50;
.loc 28 19 0
max.u32 %r52, %r49, %r46;
mov.s32 %r53, %r52;
.loc 28 18 0
mov.s32 %r54, %r45;
min.u32 %r55, %r38, %r54;
mov.s32 %r56, %r55;
.loc 28 19 0
max.u32 %r57, %r38, %r54;
mov.s32 %r58, %r57;
.loc 28 25 0
mov.s32 %r59, %r56;
mov.s32 %r60, %r53;
min.u32 %r61, %r59, %r60;
mov.s32 %r62, %r61;
.loc 28 26 0
max.u32 %r63, %r59, %r60;
mov.s32 %r64, %r63;
.loc 28 29 0
mov.s32 %r65, %r51;
mov.s32 %r66, %r65;
.loc 28 30 0
mov.s32 %r67, %r57;
// Sort pass 4 (j == 3).
.loc 28 18 0
mov.s32 %r68, %r62;
min.u32 %r69, %r68, %r65;
mov.s32 %r70, %r69;
.loc 28 19 0
max.u32 %r71, %r68, %r65;
mov.s32 %r72, %r71;
.loc 28 18 0
mov.s32 %r73, %r64;
min.u32 %r74, %r57, %r73;
mov.s32 %r75, %r74;
.loc 28 19 0
max.u32 %r76, %r57, %r73;
mov.s32 %r77, %r76;
.loc 28 25 0
mov.s32 %r78, %r75;
mov.s32 %r79, %r72;
min.u32 %r80, %r78, %r79;
mov.s32 %r81, %r80;
.loc 28 26 0
max.u32 %r82, %r78, %r79;
mov.s32 %r83, %r82;
.loc 28 29 0
mov.s32 %r84, %r70;
mov.s32 %r85, %r84;
.loc 28 30 0
mov.s32 %r86, %r76;
// Scatter: store the sorted pixels[0..3] back to the same column.
.loc 28 36 0
st.global.u32 [%rd4+0], %r84;
mov.s32 %r87, %r81;
st.global.u32 [%rd4+1024], %r87;
mov.s32 %r88, %r83;
st.global.u32 [%rd4+2048], %r88;
st.global.u32 [%rd4+3072], %r76;
.loc 28 38 0
exit;
$LDWend__Z4sortPA256_j:
} // _Z4sortPA256_j[/codebox]
The real code (not going to post it) has the same behavior but the data set is larger (but not large enough for warp level sorting to make sense and I would really like to avoid burning the extra instructions to do shared memory I/O). The kernel is definitely compute limited (about 75% of the kernel run-time is taken up by this kind of code) so these extra instructions are really dragging the performance down. At least I am pretty sure it is being dragged down, but I would like to confirm by actually getting this code working the way I want it to.
I have tried using decuda, but it looks like I would need to roll back to 2.3 in order to generate cubin files that are compatible with the disassembler. Can’t do that yet.
What am I missing here? A straight up bubble sort with a single buffer has no extra MOV instructions but each iteration takes 3-4 instructions rather than 2, which is not desirable.