Extra MOV instructions? ...doubles the number of instructions executed...

Alright so I’ve found another interesting little bit with my system, toolkit 3.1, and the output from it. I’ve confirmed my code compiles with the same extra MOV instructions regardless of the max register count specified, regardless of which architecture the code is compiled for, and with several different computers. I have also tried switching from unsigned to signed integers and all that does is change the .u32 to .s32 for the min and max instructions.

I cannot get rid of these!!!

Here is the minimal C code that produces the odd PTX output. Using some compiler unrolling the code actually expands out to statically access the arrays pixels and buf, which causes the data to stay in registers…so there is no local memory usage. Each thread sorts 4 pieces of input data using odd-even sort and then stores the sorted data back to global memory.

In theory each swap of the data should only take two instructions because I have two buffers for sorting.

[codebox]// Each thread sorts its own 4 values (one column of g_pixels) with an
// odd-even transposition sort, double-buffered between pixels[] and buf[]
// so each compare-and-swap should cost exactly one min + one max.
// NOTE: forum markup ate the underscores — the qualifier is __global__.
__global__ void sort (unsigned int g_pixels[4][256])
{
	// Working copies; with full unrolling all indices are static, so these
	// arrays should stay in registers (no local-memory spills).
	register unsigned int pixels[4];
	register unsigned int buf[4];

	// Gather this thread's column from global memory.
	#pragma unroll
	for (unsigned int j = 0; j < 4; j++)
	{
		pixels[j] = g_pixels[j][threadIdx.x];
	}

	// 4 passes of odd-even transposition sort (enough for N = 4).
	#pragma unroll
	for (unsigned int j = 0; j < 4; j++)
	{
		// Even phase: compare-exchange pairs (0,1) and (2,3) into buf.
		#pragma unroll
		for (unsigned int i = 0; i < 4; i+=2)
		{
			buf[i] = min(pixels[i], pixels[i+1]);

			buf[i+1] = max(pixels[i], pixels[i+1]);
		}

		// Odd phase: compare-exchange pair (1,2) back into pixels.
		#pragma unroll
		for (unsigned int i = 1; i < 3; i+=2)
		{
			pixels[i] = min(buf[i], buf[i+1]);

			pixels[i+1] = max(buf[i], buf[i+1]);
		}

		// Endpoints pass through untouched in the odd phase.
		pixels[0] = buf[0];

		pixels[3] = buf[3];
	}

	// Scatter the sorted column back to global memory.
	#pragma unroll
	for (unsigned int j = 0; j < 4; j++)
	{
		g_pixels[j][threadIdx.x] = pixels[j];
	}
}[/codebox]

This is the PTX output from it. You do not need to read all of it, but take note that there are plenty of extra MOV instructions that are not needed.

[codebox] .entry _Z4sortPA256_j (

	.param .u64 __cudaparm__Z4sortPA256_j_g_pixels)

{

.reg .u32 %r<90>;

.reg .u64 %rd<6>;

// pixels = 0

// buf = 16

.loc	28	1	0

$LDWbegin__Z4sortPA256_j:

.loc	28	9	0

ld.param.u64 	%rd1, [__cudaparm__Z4sortPA256_j_g_pixels];

cvt.u64.u32 	%rd2, %tid.x;

mov.s32 	%r1, %tid.x;

mul.wide.u32 	%rd3, %r1, 4;

add.u64 	%rd4, %rd1, %rd3;

ld.global.u32 	%r2, [%rd4+0];

mov.s32 	%r3, %r2;

ld.global.u32 	%r4, [%rd4+1024];

mov.s32 	%r5, %r4;

ld.global.u32 	%r6, [%rd4+2048];

mov.s32 	%r7, %r6;

ld.global.u32 	%r8, [%rd4+3072];

mov.s32 	%r9, %r8;

.loc	28	18	0

mov.s32 	%r10, %r5;

mov.s32 	%r11, %r3;

min.u32 	%r12, %r10, %r11;

mov.s32 	%r13, %r12;

.loc	28	19	0

max.u32 	%r14, %r10, %r11;

mov.s32 	%r15, %r14;

.loc	28	18	0

mov.s32 	%r16, %r7;

min.u32 	%r17, %r8, %r16;

mov.s32 	%r18, %r17;

.loc	28	19	0

max.u32 	%r19, %r8, %r16;

mov.s32 	%r20, %r19;

.loc	28	25	0

mov.s32 	%r21, %r18;

mov.s32 	%r22, %r15;

min.u32 	%r23, %r21, %r22;

mov.s32 	%r24, %r23;

.loc	28	26	0

max.u32 	%r25, %r21, %r22;

mov.s32 	%r26, %r25;

.loc	28	29	0

mov.s32 	%r27, %r13;

mov.s32 	%r28, %r27;

.loc	28	30	0

mov.s32 	%r29, %r19;

.loc	28	18	0

mov.s32 	%r30, %r24;

min.u32 	%r31, %r30, %r27;

mov.s32 	%r32, %r31;

.loc	28	19	0

max.u32 	%r33, %r30, %r27;

mov.s32 	%r34, %r33;

.loc	28	18	0

mov.s32 	%r35, %r26;

min.u32 	%r36, %r19, %r35;

mov.s32 	%r37, %r36;

.loc	28	19	0

max.u32 	%r38, %r19, %r35;

mov.s32 	%r39, %r38;

.loc	28	25	0

mov.s32 	%r40, %r37;

mov.s32 	%r41, %r34;

min.u32 	%r42, %r40, %r41;

mov.s32 	%r43, %r42;

.loc	28	26	0

max.u32 	%r44, %r40, %r41;

mov.s32 	%r45, %r44;

.loc	28	29	0

mov.s32 	%r46, %r32;

mov.s32 	%r47, %r46;

.loc	28	30	0

mov.s32 	%r48, %r38;

.loc	28	18	0

mov.s32 	%r49, %r43;

min.u32 	%r50, %r49, %r46;

mov.s32 	%r51, %r50;

.loc	28	19	0

max.u32 	%r52, %r49, %r46;

mov.s32 	%r53, %r52;

.loc	28	18	0

mov.s32 	%r54, %r45;

min.u32 	%r55, %r38, %r54;

mov.s32 	%r56, %r55;

.loc	28	19	0

max.u32 	%r57, %r38, %r54;

mov.s32 	%r58, %r57;

.loc	28	25	0

mov.s32 	%r59, %r56;

mov.s32 	%r60, %r53;

min.u32 	%r61, %r59, %r60;

mov.s32 	%r62, %r61;

.loc	28	26	0

max.u32 	%r63, %r59, %r60;

mov.s32 	%r64, %r63;

.loc	28	29	0

mov.s32 	%r65, %r51;

mov.s32 	%r66, %r65;

.loc	28	30	0

mov.s32 	%r67, %r57;

.loc	28	18	0

mov.s32 	%r68, %r62;

min.u32 	%r69, %r68, %r65;

mov.s32 	%r70, %r69;

.loc	28	19	0

max.u32 	%r71, %r68, %r65;

mov.s32 	%r72, %r71;

.loc	28	18	0

mov.s32 	%r73, %r64;

min.u32 	%r74, %r57, %r73;

mov.s32 	%r75, %r74;

.loc	28	19	0

max.u32 	%r76, %r57, %r73;

mov.s32 	%r77, %r76;

.loc	28	25	0

mov.s32 	%r78, %r75;

mov.s32 	%r79, %r72;

min.u32 	%r80, %r78, %r79;

mov.s32 	%r81, %r80;

.loc	28	26	0

max.u32 	%r82, %r78, %r79;

mov.s32 	%r83, %r82;

.loc	28	29	0

mov.s32 	%r84, %r70;

mov.s32 	%r85, %r84;

.loc	28	30	0

mov.s32 	%r86, %r76;

.loc	28	36	0

st.global.u32 	[%rd4+0], %r84;

mov.s32 	%r87, %r81;

st.global.u32 	[%rd4+1024], %r87;

mov.s32 	%r88, %r83;

st.global.u32 	[%rd4+2048], %r88;

st.global.u32 	[%rd4+3072], %r76;

.loc	28	38	0

exit;

$LDWend__Z4sortPA256_j:

} // _Z4sortPA256_j[/codebox]

The real code (not going to post it) has the same behavior but the data set is larger (but not large enough for warp level sorting to make sense and I would really like to avoid burning the extra instructions to do shared memory I/O). The kernel is definitely compute limited (about 75% of the kernel run-time is taken up by this kind of code) so these extra instructions are really dragging the performance down. At least I am pretty sure it is being dragged down, but I would like to confirm by actually getting this code working the way I want it to.

I have tried using decuda, but it looks like I would need to roll back to 2.3 in order to generate cubin files that are compatible with the disassembler. Can’t do that yet.

What am I missing here? A straight up bubble sort with a single buffer has no extra MOV instructions but each iteration takes 3-4 instructions rather than 2, which is not desirable.

The movs get optimized out when ptx is compiled to cubin.

To reassure you, I’ve just run your example through decuda. Here’s the ptx generated by my old CUDA 2.1 installation:

[codebox]

.entry _Z4sortPA256_j

{

.reg .u16 %rh<4>;

.reg .u32 %r<28>;

.reg .u64 %rd<6>;

.reg .pred %p<3>;

.param .u64 __cudaparm__Z4sortPA256_j_g_pixels;

// pixels = 0

// buf = 16

.loc    16  1   0

$LBB1__Z4sortPA256_j:

.loc    16  9   0

ld.param.u64    %rd1, [__cudaparm__Z4sortPA256_j_g_pixels]; // id:102 __cudaparm__Z4sortPA256_j_g_pixels+0x0

cvt.u32.u16     %r1, %tid.x;        //

cvt.u64.u32     %rd2, %r1;          //

mul.lo.u64  %rd3, %rd2, 4;      //

add.u64     %rd4, %rd1, %rd3;       //

ld.global.u32   %r2, [%rd4+0];  // id:104

mov.s32     %r3, %r2;               //

ld.global.u32   %r4, [%rd4+1024];   // id:105

mov.s32     %r5, %r4;               //

ld.global.u32   %r6, [%rd4+2048];   // id:106

mov.s32     %r7, %r6;               //

ld.global.u32   %r8, [%rd4+3072];   // id:107

mov.s32     %r9, %r8;               //

mov.u16     %rh1, 0;                //

mov.s32     %r10, %r3;              //

$Lt_0_23:

// Loop body line 9, nesting depth: 1, iterations: 4

.loc    16  18  0

mov.s32     %r11, %r5;              //

min.u32     %r12, %r11, %r10;       //

mov.s32     %r13, %r12;             //

.loc    16  19  0

max.u32     %r14, %r11, %r10;       //

mov.s32     %r15, %r14;             //

.loc    16  18  0

mov.s32     %r16, %r7;              //

min.u32     %r17, %r8, %r16;        //

mov.s32     %r18, %r17;             //

.loc    16  19  0

max.u32     %r19, %r8, %r16;        //

mov.s32     %r20, %r19;             //

.loc    16  25  0

mov.s32     %r21, %r18;             //

mov.s32     %r22, %r15;             //

min.u32     %r23, %r21, %r22;       //

mov.s32     %r5, %r23;              //

.loc    16  26  0

max.u32     %r24, %r21, %r22;       //

mov.s32     %r7, %r24;              //

.loc    16  29  0

mov.s32     %r10, %r13;             //

mov.s32     %r3, %r10;              //

.loc    16  30  0

mov.s32     %r8, %r19;              //

mov.s32     %r9, %r8;               //

add.u16     %rh1, %rh1, 1;          //

mov.u16     %rh2, 4;                //

setp.ne.u16     %p1, %rh1, %rh2;    //

@%p1 bra    $Lt_0_23;               //

.loc    16  36  0

st.global.u32   [%rd4+0], %r10; // id:110

mov.s32     %r25, %r5;              //

st.global.u32   [%rd4+1024], %r25;  // id:111

mov.s32     %r26, %r7;              //

st.global.u32   [%rd4+2048], %r26;  // id:112

st.global.u32   [%rd4+3072], %r19;  // id:113

.loc    16  38  0

exit;                           //

$LDWend__Z4sortPA256_j:

} // _Z4sortPA256_j

[/codebox]

and here is decuda’s disassembly:

// Disassembling _Z4sortPA256_j

000000: a0000001 04000780 cvt.rn.u32.u16 $r0, $r0.lo

000008: 30020001 c4100780 shl.u32 $r0, $r0, 0x00000002

000010: 2000c809 04200780 add.u32 $r2, s[0x0010], $r0

000018: 10000009 00000003 mov.b16 $r1.lo, 0x0000

000020: 20008401 00000043 add.b32 $r0, $r2, 0x00000400

000028: 20008411 00000083 add.b32 $r4, $r2, 0x00000800

000030: 20008419 000000c3 add.b32 $r6, $r2, 0x00000c00

000038: d00e0415 80c00780 mov.u32 $r5, g[$r2]

000040: d00e000d 80c00780 mov.u32 $r3, g[$r0]

000048: d00e0811 80c00780 mov.u32 $r4, g[$r4]

000050: d00e0c01 80c00780 mov.u32 $r0, g[$r6]

000058: 30050619 84000780 label0: max.u32 $r6, $r3, $r5

000060: 3004001d a4000780 min.u32 $r7, $r0, $r4

000068: 20010409 00000003 add.b16 $r1.lo, $r1.lo, 0x0001

000070: 30050615 a4000780 min.u32 $r5, $r3, $r5

000078: 30040001 84000780 max.u32 $r0, $r0, $r4

000080: 308005fd 604147c8 set.ne.u16 $p0|$o127, $r1.lo, c1[0x0000]

000088: 30060e0d a4000780 min.u32 $r3, $r7, $r6

000090: 30060e11 84000780 max.u32 $r4, $r7, $r6

000098: 1000b003 00000280 @$p0.ne bra.label label0

0000a0: d00e0415 a0c00780 mov.u32 g[$r2], $r5

0000a8: 20008405 00000043 add.b32 $r1, $r2, 0x00000400

0000b0: d00e020d a0c00780 mov.u32 g[$r1], $r3

0000b8: 20008405 00000083 add.b32 $r1, $r2, 0x00000800

0000c0: d00e0211 a0c00780 mov.u32 g[$r1], $r4

0000c8: 20008405 000000c3 add.b32 $r1, $r2, 0x00000c00

0000d0: d00e0201 a0c00781 mov.end.u32 g[$r1], $r0

// segment: const (1:0000)

0000: 00000004

Well that is annoying. Thanks for trying it (I’ll just have to set up a 2.x system). Any idea when decuda might work with 3.x?

Those extra add and set instructions are interesting though…I’m going to re-read that output over lunch. Wait…is the compiler optimizing out the loop unrolling, or did you just take that out?

Interesting find! I had blindly run your example, as-is, through nvcc and decuda.

Here’s what happens in current CUDA 3.1 - first the PTX again:

[codebox]

    .entry _Z4sortPA256_j (

            .param .u32 __cudaparm__Z4sortPA256_j_g_pixels)

    {

    .reg .u16 %rh<3>;

    .reg .u32 %r<92>;

    .loc    28      1       0

$LDWbegin__Z4sortPA256_j:

    .loc    28      9       0

    mov.u16         %rh1, %tid.x;

    mul.wide.u16    %r1, %rh1, 4;

    ld.param.u32    %r2, [__cudaparm__Z4sortPA256_j_g_pixels];

    add.u32         %r3, %r2, %r1;

    ld.global.u32   %r4, [%r3+0];

    mov.s32         %r5, %r4;

    ld.global.u32   %r6, [%r3+1024];

    mov.s32         %r7, %r6;

    ld.global.u32   %r8, [%r3+2048];

    mov.s32         %r9, %r8;

    ld.global.u32   %r10, [%r3+3072];

    mov.s32         %r11, %r10;

    .loc    28      18      0

    mov.s32         %r12, %r7;

    mov.s32         %r13, %r5;

    min.u32         %r14, %r12, %r13;

    mov.s32         %r15, %r14;

    .loc    28      19      0

    max.u32         %r16, %r12, %r13;

    mov.s32         %r17, %r16;

    .loc    28      18      0

    mov.s32         %r18, %r9;

    min.u32         %r19, %r10, %r18;

    mov.s32         %r20, %r19;

    .loc    28      19      0

    max.u32         %r21, %r10, %r18;

    mov.s32         %r22, %r21;

    .loc    28      25      0

    mov.s32         %r23, %r20;

    mov.s32         %r24, %r17;

    min.u32         %r25, %r23, %r24;

    mov.s32         %r26, %r25;

    .loc    28      26      0

    max.u32         %r27, %r23, %r24;

    mov.s32         %r28, %r27;

    .loc    28      29      0

    mov.s32         %r29, %r15;

    mov.s32         %r30, %r29;

    .loc    28      30      0

    mov.s32         %r31, %r21;

    .loc    28      18      0

    mov.s32         %r32, %r26;

    min.u32         %r33, %r32, %r29;

    mov.s32         %r34, %r33;

    .loc    28      19      0

    max.u32         %r35, %r32, %r29;

    mov.s32         %r36, %r35;

    .loc    28      18      0

    mov.s32         %r37, %r28;

    min.u32         %r38, %r21, %r37;

    mov.s32         %r39, %r38;

    .loc    28      19      0

    max.u32         %r40, %r21, %r37;

    mov.s32         %r41, %r40;

    .loc    28      25      0

    mov.s32         %r42, %r39;

    mov.s32         %r43, %r36;

    min.u32         %r44, %r42, %r43;

    mov.s32         %r45, %r44;

    .loc    28      26      0

    max.u32         %r46, %r42, %r43;

    mov.s32         %r47, %r46;

    .loc    28      29      0

    mov.s32         %r48, %r34;

    mov.s32         %r49, %r48;

    .loc    28      30      0

    mov.s32         %r50, %r40;

    .loc    28      18      0

    mov.s32         %r51, %r45;

    min.u32         %r52, %r51, %r48;

    mov.s32         %r53, %r52;

    .loc    28      19      0

    max.u32         %r54, %r51, %r48;

    mov.s32         %r55, %r54;

    .loc    28      18      0

    mov.s32         %r56, %r47;

    min.u32         %r57, %r40, %r56;

    mov.s32         %r58, %r57;

    .loc    28      19      0

    max.u32         %r59, %r40, %r56;

    mov.s32         %r60, %r59;

    .loc    28      25      0

    mov.s32         %r61, %r58;

    mov.s32         %r62, %r55;

    min.u32         %r63, %r61, %r62;

    mov.s32         %r64, %r63;

    .loc    28      26      0

    max.u32         %r65, %r61, %r62;

    mov.s32         %r66, %r65;

    .loc    28      29      0

    mov.s32         %r67, %r53;

    mov.s32         %r68, %r67;

    .loc    28      30      0

    mov.s32         %r69, %r59;

    .loc    28      18      0

    mov.s32         %r70, %r64;

    min.u32         %r71, %r70, %r67;

    mov.s32         %r72, %r71;

    .loc    28      19      0

    max.u32         %r73, %r70, %r67;

    mov.s32         %r74, %r73;

    .loc    28      18      0

    mov.s32         %r75, %r66;

    min.u32         %r76, %r59, %r75;

    mov.s32         %r77, %r76;

    .loc    28      19      0

    max.u32         %r78, %r59, %r75;

    mov.s32         %r79, %r78;

    .loc    28      25      0

    mov.s32         %r80, %r77;

    mov.s32         %r81, %r74;

    min.u32         %r82, %r80, %r81;

    mov.s32         %r83, %r82;

    .loc    28      26      0

    max.u32         %r84, %r80, %r81;

    mov.s32         %r85, %r84;

    .loc    28      29      0

    mov.s32         %r86, %r72;

    mov.s32         %r87, %r86;

    .loc    28      30      0

    mov.s32         %r88, %r78;

    .loc    28      36      0

    st.global.u32   [%r3+0], %r86;

    mov.s32         %r89, %r83;

    st.global.u32   [%r3+1024], %r89;

    mov.s32         %r90, %r85;

    st.global.u32   [%r3+2048], %r90;

    st.global.u32   [%r3+3072], %r78;

    .loc    28      38      0

    exit;

$LDWend__Z4sortPA256_j:

    } // _Z4sortPA256_j

[/codebox]

And here’s the disassembled .cubin (see this thread for info on how to get decuda to work with 3.0 or 3.1 cubins):

// Disassembling _Z4sortPA256_j

000000: 1000c805 0423c780 mov.b32 $r1, s[0x0010]

000008: 60800001 00404780 mad24.lo.u32.u16.u16.u32 $r0, $r0.lo, c1[0x0000], $r1

000010: 2000800d 00000043 add.b32 $r3, $r0, 0x00000400

000018: 20008009 00000083 add.b32 $r2, $r0, 0x00000800

000020: 20008005 000000c3 add.b32 $r1, $r0, 0x00000c00

000028: d00e0021 80c00780 mov.u32 $r8, g[$r0]

000030: d00e0605 80c00780 mov.u32 $r1, g[$r3]

000038: d00e0419 80c00780 mov.u32 $r6, g[$r2]

000040: d00e020d 80c00780 mov.u32 $r3, g[$r1]

000048: 30081215 84000780 max.u32 $r5, $r9, $r8

000050: 30060e01 a4000780 min.u32 $r0, $r7, $r6

000058: 30081225 a4000780 min.u32 $r9, $r9, $r8

000060: 30060e01 84000780 max.u32 $r0, $r7, $r6

000068: 30050819 a4000780 min.u32 $r6, $r4, $r5

000070: 3005080d 84000780 max.u32 $r3, $r4, $r5

000078: 30090c15 84000780 max.u32 $r5, $r6, $r9

000080: 30071001 a4000780 min.u32 $r0, $r8, $r7

000088: 30090c19 a4000780 min.u32 $r6, $r6, $r9

000090: 3007100d 84000780 max.u32 $r3, $r8, $r7

000098: 30050821 a4000780 min.u32 $r8, $r4, $r5

0000a0: 30050805 84000780 max.u32 $r1, $r4, $r5

0000a8: 30061011 84000780 max.u32 $r4, $r8, $r6

0000b0: 30090e05 a4000780 min.u32 $r1, $r7, $r9

0000b8: 30061021 a4000780 min.u32 $r8, $r8, $r6

0000c0: 30040a09 a4000780 min.u32 $r2, $r5, $r4

0000c8: 30090e1d 84000780 max.u32 $r7, $r7, $r9

0000d0: 30040a01 84000780 max.u32 $r0, $r5, $r4

0000d8: 30080c15 a4000780 min.u32 $r5, $r6, $r8

0000e0: 30080c01 84000780 max.u32 $r0, $r6, $r8

0000e8: 30040e19 a4000780 min.u32 $r6, $r7, $r4

0000f0: d00e0005 a0c00780 mov.u32 g[$r0], $r1

0000f8: 30080c01 a4000780 min.u32 $r0, $r6, $r8

000100: 30080c05 84000780 max.u32 $r1, $r6, $r8

000108: d00e0601 a0c00780 mov.u32 g[$r3], $r0

000110: 30040e01 84000780 max.u32 $r0, $r7, $r4

000118: d00e0415 a0c00780 mov.u32 g[$r2], $r5

000120: d00e0201 a0c00781 mov.end.u32 g[$r1], $r0

// segment: const (0:0000)

0000: 00000004

So CUDA 3.1 really unrolls the loops without recombining them into one large loop afterwards.

And the movs are still gone. :)

Yup, the MOVs are still gone and you now know a nifty trick to use arrays and force them to stay in register space ;).

Looks like this algorithm is bust then…

As for decuda, I tried that new tool you pointed me towards and decuda is still crashing. (EDIT: Got it working, but only for SM1.x cubin files)

You can use the disassembler from the Nouveau project for sm_20 cubins.

Why is the algorithm bust?

It is not bust in the sense that it does not work; it is bust because it is not substantially faster than the original code I am trying to optimize.

The only way to beat this is to move to an n·log(n) algorithm, I think. But to do it without warp divergence…yum! Something tells me bitonic sort might work.

From that example it looks like you are doing some variation of sorting. At least the generic version of sorting is one of the most studied problems for CUDA. This paper covers the state of the art: [url=“http://www.cs.virginia.edu/~dgm4d/papers/RadixSortTR.pdf”]http://www.cs.virginia.edu/~dgm4d/papers/RadixSortTR.pdf[/url] . I would start there if you haven’t already seen it.

I’ll take a look, thanks.

The fun part of this is that I need to sort a set of 8-16 integers, but there are about 1-2M sets of these little buggers to do. So at least right now it makes sense to work on sorting algorithms where each thread sorts a single set of inputs using sorting methods that cause no warp divergence (i.e. bubble sort, even-odd sort).

Since you are only sorting four values, how about this?

// Sorts 4 values per thread with the optimal 5-comparator sorting network
// for N = 4: compare-exchange pairs (0,1), (2,3), (0,2), (1,3), (1,2).
// Each comparator is a branch-free min/max pair, so there is no warp
// divergence. The exact statement order IS the network — do not reorder.
__global__ void sort (unsigned int g_pixels[4][256])

{

	// Per-thread working buffers; with the loops fully unrolled all array
	// indices are compile-time constants, so these stay in registers.
	register unsigned int pixels[4];

	register unsigned int buf[4];

	// Load this thread's column of 4 values from global memory.
	#pragma unroll

	for (unsigned int j = 0; j < 4; j++)

	{

		pixels[j] = g_pixels[j][threadIdx.x];

	}

	// Stage 1: comparators (0,1) and (2,3) — sort each adjacent pair.
	#pragma unroll

	for (unsigned int i = 0; i < 4; i+=2)

	{

		buf[i] = min(pixels[i], pixels[i+1]);

		buf[i+1] = max(pixels[i], pixels[i+1]);

	}

	// Stage 2: comparators (0,2) and (1,3) — merge the two sorted pairs.
	#pragma unroll

	for (unsigned int i = 0; i < 2; i++)

	{

		pixels[i] = min(buf[i], buf[i+2]);

		pixels[i+2] = max(buf[i], buf[i+2]);

	}

	// Stage 3: final comparator (1,2) resolves the middle elements.
	buf[1] = min(pixels[1], pixels[2]);

	buf[2] = max(pixels[1], pixels[2]);

	pixels[1] = buf[1];

	pixels[2] = buf[2];

	// Store the fully sorted column back to global memory.
	#pragma unroll

	for (unsigned int j = 0; j < 4; j++)

	{

		g_pixels[j][threadIdx.x] = pixels[j];

	}

}

Yup, that works for 4 elements. I’ve got working code for 8 too, but this is only sample code.

N is between 3 and 16 inclusive in the real code, so right now I’m stuck hand coding the sorting networks. Not that big of a deal at all, but I’m relatively new to bitonic sort so I am taking my time to really understand it while I am at it.

Bitonic sort takes about 75% of the run time of bubble sort with n=8 which is great. That 75% includes the overhead for global memory transfers, so the actual compute is much faster in reality. Once I get the bigger N’s all figured out I’m sure there will be an even bigger improvement.

It’s funny though that, at least in this case, removing the double buffer and just making code that runs with a temp variable or two is equally as fast.