Alright, so I’ve found another interesting little quirk with my system (CUDA toolkit 3.1) and the output from it. I’ve confirmed my code compiles to PTX containing the same extra MOV instructions regardless of the max register count specified, regardless of which architecture the code is compiled for, and on several different computers. I have also tried switching from unsigned to signed integers, and all that does is change the .u32 to .s32 for the min and max instructions.
I cannot get rid of these extra MOV instructions.
Here is the minimal C code that produces the odd PTX output. Because the compiler fully unrolls the loops, every access to the arrays pixels and buf uses a constant index, which keeps the data in registers — so there is no local memory usage. Each thread sorts 4 pieces of input data using odd-even sort and then stores the sorted data back to global memory.
In theory each swap of the data should only take two instructions because I have two buffers for sorting.
[codebox]__global__ void sort(unsigned int g_pixels[4][256])
{
    // Per-thread working buffers.  Because every loop below is fully unrolled,
    // all indices into these arrays are compile-time constants, so the data
    // lives entirely in registers (no local-memory spill) — the emitted PTX
    // confirms this ("// pixels = 0 / buf = 16" with no ld.local/st.local).
    register unsigned int pixels[4];
    register unsigned int buf[4];

    // Gather: each thread loads its own column of the 4x256 input
    // (rows are 256 * 4 bytes = 1024 bytes apart in the PTX offsets).
#pragma unroll
    for (unsigned int j = 0; j < 4; j++)
    {
        pixels[j] = g_pixels[j][threadIdx.x];
    }

    // Odd-even transposition sort of the 4 values.  Each j-iteration performs
    // one even stage and one odd stage, ping-ponging between buf and pixels so
    // each compare/exchange is just a min + max (two instructions).
    // NOTE(review): n elements need only n stages; 4 iterations here give 8
    // stages, so 2 iterations would presumably suffice — confirm before
    // reducing, since this code is also serving as a compiler repro case.
#pragma unroll
    for (unsigned int j = 0; j < 4; j++)
    {
        // Even stage: compare/exchange pairs (0,1) and (2,3) into buf.
#pragma unroll
        for (unsigned int i = 0; i < 4; i += 2)
        {
            buf[i]     = min(pixels[i], pixels[i + 1]);
            buf[i + 1] = max(pixels[i], pixels[i + 1]);
        }
        // Odd stage: compare/exchange pair (1,2) back into pixels.
#pragma unroll
        for (unsigned int i = 1; i < 3; i += 2)
        {
            pixels[i]     = min(buf[i], buf[i + 1]);
            pixels[i + 1] = max(buf[i], buf[i + 1]);
        }
        // The endpoints are untouched by the odd stage; copy them through.
        pixels[0] = buf[0];
        pixels[3] = buf[3];
    }

    // Scatter: write the sorted column back to global memory.
#pragma unroll
    for (unsigned int j = 0; j < 4; j++)
    {
        g_pixels[j][threadIdx.x] = pixels[j];
    }
}[/codebox]
This is the PTX output from it. You do not need to read all of it, but note that it contains plenty of extra register-to-register MOV instructions that should not be needed.
[codebox] .entry _Z4sortPA256_j (
.param .u64 __cudaparm__Z4sortPA256_j_g_pixels)
{
.reg .u32 %r<90>;
.reg .u64 %rd<6>;
// NOTE(review): PTX declares an effectively unlimited pool of *virtual*
// registers (%r1..%r89 here); the reg-to-reg mov.s32 copies below are
// virtual-register copies, which ptxas presumably coalesces during register
// allocation when it lowers PTX to machine code — verify against the actual
// SASS with a cubin disassembler before concluding they cost cycles.
// pixels = 0
// buf = 16
.loc 28 1 0
$LDWbegin__Z4sortPA256_j:
.loc 28 9 0
// Compute this thread's column pointer: %rd4 = g_pixels + threadIdx.x * 4.
ld.param.u64 %rd1, [__cudaparm__Z4sortPA256_j_g_pixels];
cvt.u64.u32 %rd2, %tid.x;
mov.s32 %r1, %tid.x;
mul.wide.u32 %rd3, %r1, 4;
add.u64 %rd4, %rd1, %rd3;
// Gather: load pixels[0..3]; rows are 256 * 4 = 1024 bytes apart.
ld.global.u32 %r2, [%rd4+0];
mov.s32 %r3, %r2;
ld.global.u32 %r4, [%rd4+1024];
mov.s32 %r5, %r4;
ld.global.u32 %r6, [%rd4+2048];
mov.s32 %r7, %r6;
ld.global.u32 %r8, [%rd4+3072];
mov.s32 %r9, %r8;
// Sort pass 1 (j == 0): even stage (.loc 18/19), odd stage (.loc 25/26),
// endpoint copies (.loc 29/30).
.loc 28 18 0
mov.s32 %r10, %r5;
mov.s32 %r11, %r3;
min.u32 %r12, %r10, %r11;
mov.s32 %r13, %r12;
.loc 28 19 0
max.u32 %r14, %r10, %r11;
mov.s32 %r15, %r14;
.loc 28 18 0
mov.s32 %r16, %r7;
min.u32 %r17, %r8, %r16;
mov.s32 %r18, %r17;
.loc 28 19 0
max.u32 %r19, %r8, %r16;
mov.s32 %r20, %r19;
.loc 28 25 0
mov.s32 %r21, %r18;
mov.s32 %r22, %r15;
min.u32 %r23, %r21, %r22;
mov.s32 %r24, %r23;
.loc 28 26 0
max.u32 %r25, %r21, %r22;
mov.s32 %r26, %r25;
.loc 28 29 0
mov.s32 %r27, %r13;
mov.s32 %r28, %r27;
.loc 28 30 0
mov.s32 %r29, %r19;
// Sort pass 2 (j == 1).
.loc 28 18 0
mov.s32 %r30, %r24;
min.u32 %r31, %r30, %r27;
mov.s32 %r32, %r31;
.loc 28 19 0
max.u32 %r33, %r30, %r27;
mov.s32 %r34, %r33;
.loc 28 18 0
mov.s32 %r35, %r26;
min.u32 %r36, %r19, %r35;
mov.s32 %r37, %r36;
.loc 28 19 0
max.u32 %r38, %r19, %r35;
mov.s32 %r39, %r38;
.loc 28 25 0
mov.s32 %r40, %r37;
mov.s32 %r41, %r34;
min.u32 %r42, %r40, %r41;
mov.s32 %r43, %r42;
.loc 28 26 0
max.u32 %r44, %r40, %r41;
mov.s32 %r45, %r44;
.loc 28 29 0
mov.s32 %r46, %r32;
mov.s32 %r47, %r46;
.loc 28 30 0
mov.s32 %r48, %r38;
// Sort pass 3 (j == 2).
.loc 28 18 0
mov.s32 %r49, %r43;
min.u32 %r50, %r49, %r46;
mov.s32 %r51, %r50;
.loc 28 19 0
max.u32 %r52, %r49, %r46;
mov.s32 %r53, %r52;
.loc 28 18 0
mov.s32 %r54, %r45;
min.u32 %r55, %r38, %r54;
mov.s32 %r56, %r55;
.loc 28 19 0
max.u32 %r57, %r38, %r54;
mov.s32 %r58, %r57;
.loc 28 25 0
mov.s32 %r59, %r56;
mov.s32 %r60, %r53;
min.u32 %r61, %r59, %r60;
mov.s32 %r62, %r61;
.loc 28 26 0
max.u32 %r63, %r59, %r60;
mov.s32 %r64, %r63;
.loc 28 29 0
mov.s32 %r65, %r51;
mov.s32 %r66, %r65;
.loc 28 30 0
mov.s32 %r67, %r57;
// Sort pass 4 (j == 3).
.loc 28 18 0
mov.s32 %r68, %r62;
min.u32 %r69, %r68, %r65;
mov.s32 %r70, %r69;
.loc 28 19 0
max.u32 %r71, %r68, %r65;
mov.s32 %r72, %r71;
.loc 28 18 0
mov.s32 %r73, %r64;
min.u32 %r74, %r57, %r73;
mov.s32 %r75, %r74;
.loc 28 19 0
max.u32 %r76, %r57, %r73;
mov.s32 %r77, %r76;
.loc 28 25 0
mov.s32 %r78, %r75;
mov.s32 %r79, %r72;
min.u32 %r80, %r78, %r79;
mov.s32 %r81, %r80;
.loc 28 26 0
max.u32 %r82, %r78, %r79;
mov.s32 %r83, %r82;
.loc 28 29 0
mov.s32 %r84, %r70;
mov.s32 %r85, %r84;
.loc 28 30 0
mov.s32 %r86, %r76;
// Scatter: store the sorted pixels[0..3] back to the same column.
.loc 28 36 0
st.global.u32 [%rd4+0], %r84;
mov.s32 %r87, %r81;
st.global.u32 [%rd4+1024], %r87;
mov.s32 %r88, %r83;
st.global.u32 [%rd4+2048], %r88;
st.global.u32 [%rd4+3072], %r76;
.loc 28 38 0
exit;
$LDWend__Z4sortPA256_j:
} // _Z4sortPA256_j[/codebox]
The real code (not going to post it) has the same behavior but the data set is larger (but not large enough for warp level sorting to make sense and I would really like to avoid burning the extra instructions to do shared memory I/O). The kernel is definitely compute limited (about 75% of the kernel run-time is taken up by this kind of code) so these extra instructions are really dragging the performance down. At least I am pretty sure it is being dragged down, but I would like to confirm by actually getting this code working the way I want it to.
I have tried using decuda, but it looks like I would need to roll back to 2.3 in order to generate cubin files that are compatible with the disassembler. Can’t do that yet.
What am I missing here? A straight up bubble sort with a single buffer has no extra MOV instructions but each iteration takes 3-4 instructions rather than 2, which is not desirable.