How to work around suboptimal code generated for the __byte_perm intrinsic?

I have a device function where I would like to use the __byte_perm intrinsic like this:

#if CUDA_DEVICE_ARCH >= 200
    unsigned int lo = __byte_perm(thread_staging[4], thread_staging[5], 0x0411) | __byte_perm(thread_staging[6], thread_staging[7], 0x1104);
    unsigned int hi = __byte_perm(thread_staging[0], thread_staging[1], 0x0411) | __byte_perm(thread_staging[2], thread_staging[3], 0x1104);
#else
    unsigned int lo = (thread_staging[4] << 24) | (thread_staging[5] << 16) | (thread_staging[6] << 8) | thread_staging[7];
    unsigned int hi = (thread_staging[0] << 24) | (thread_staging[1] << 16) | (thread_staging[2] << 8) | thread_staging[3];
#endif

Unfortunately, the compiler generates suboptimal code for this intrinsic:

ld.shared.u32   %r15, [%rd4+16];
ld.shared.u32   %r16, [%rd4+20];
ld.shared.u32   %r17, [%rd4+24];
ld.shared.u32   %r18, [%rd4+28];
ld.shared.u32   %r19, [%rd4+0];
ld.shared.u32   %r20, [%rd4+4];
ld.shared.u32   %r21, [%rd4+8];
ld.shared.u32   %r22, [%rd4+12];
mov.u32         %r23, 1041;
and.b32         %r24, %r23, 30583;
prmt.b32        %r25, %r15, %r16, %r24;
mov.u32         %r26, 4356;
and.b32         %r27, %r26, 30583;
prmt.b32        %r28, %r17, %r18, %r27;
or.b32          %r29, %r25, %r28;
mov.u32         %r30, 1041;
and.b32         %r31, %r30, 30583;
prmt.b32        %r32, %r19, %r20, %r31;
mov.u32         %r33, 4356;
and.b32         %r34, %r33, 30583;
prmt.b32        %r35, %r21, %r22, %r34;
or.b32          %r36, %r32, %r35;

So I decided to use inline asm and force the compiler to generate better code with this function:

__device__ __forceinline__ static
unsigned int byte_perm(unsigned int a, unsigned int b, unsigned int slct)
{
#if CUDA_DEVICE_ARCH >= 200
    unsigned int result;
    asm("prmt.b32 %0, %1, %2, %3;" : "=r"(result) : "r"(a), "r"(b), "r"(slct));
    return result;
#else
    return __byte_perm(a, b, slct);
#endif
}
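With this wrapper in place, the first snippet would simply substitute byte_perm for the intrinsic calls (a usage sketch, using the same selectors as before):

    unsigned int lo = byte_perm(thread_staging[4], thread_staging[5], 0x0411) | byte_perm(thread_staging[6], thread_staging[7], 0x1104);
    unsigned int hi = byte_perm(thread_staging[0], thread_staging[1], 0x0411) | byte_perm(thread_staging[2], thread_staging[3], 0x1104);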

Unfortunately, it makes the code even worse:

ld.shared.u32   %r15, [%rd4+16];
mov.u32         %r16, %r15;
ld.shared.u32   %r17, [%rd4+20];
mov.u32         %r18, %r17;
mov.u32         %r19, 1041;
mov.u32         %r20, %r19;
prmt.b32        %r21, %r16, %r18, %r20;
mov.s32         %r22, %r21;
ld.shared.u32   %r23, [%rd4+24];
mov.u32         %r24, %r23;
ld.shared.u32   %r25, [%rd4+28];
mov.u32         %r26, %r25;
mov.u32         %r27, 4356;
mov.u32         %r28, %r27;
prmt.b32        %r29, %r24, %r26, %r28;
mov.s32         %r30, %r29;
or.b32          %r31, %r22, %r30;
ld.shared.u32   %r32, [%rd4+0];
mov.u32         %r33, %r32;
ld.shared.u32   %r34, [%rd4+4];
mov.u32         %r35, %r34;
mov.u32         %r36, 1041;
mov.u32         %r37, %r36;
prmt.b32        %r38, %r33, %r35, %r37;
mov.s32         %r39, %r38;
ld.shared.u32   %r40, [%rd4+8];
mov.u32         %r41, %r40;
ld.shared.u32   %r42, [%rd4+12];
mov.u32         %r43, %r42;
mov.u32         %r44, 4356;
mov.u32         %r45, %r44;
prmt.b32        %r46, %r41, %r43, %r45;
mov.s32         %r47, %r46;

So what am I doing wrong? How can I avoid the unnecessary register-to-register copies?

The PTX code you are looking at is just an intermediate representation that is not fully optimized. In particular, many low-level optimizations, plus instruction scheduling and register allocation, are performed by PTXAS when it compiles the PTX code to machine code. You can examine the generated machine code with cuobjdump. The machine code should look a lot closer to what you are expecting.
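For example, assuming the kernel ends up in an executable or cubin named app (a hypothetical name), the machine code can be dumped with:

    cuobjdump -sass app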

For combining non-overlapping bit fields on Fermi, when using a left shift with an immediate shift count, try left shift plus ADD instead of left shift plus OR. This typically allows the compiler to combine the immediate left shift and the ADD into a scaled add (the ISCADD instruction).
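For reference, a minimal sketch of that variant applied to the fallback path from the question (assuming, as there, that the thread_staging elements hold byte values); since the fields do not overlap, ADD produces the same result as OR:

    // Same packing as the shift-plus-OR version, but written with ADD so
    // PTXAS can fuse the immediate shift and the add into ISCADD.
    unsigned int lo = (thread_staging[4] << 24) + (thread_staging[5] << 16) + (thread_staging[6] << 8) + thread_staging[7];
    unsigned int hi = (thread_staging[0] << 24) + (thread_staging[1] << 16) + (thread_staging[2] << 8) + thread_staging[3];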

Thanks. Which instruction has higher throughput? ISCADD or PRMT?

Sorry, I don’t know. I would suggest timing both versions at the app level to see whether there is any significant difference. The version with shift plus ADD would have the advantage of being applicable to all CUDA platforms (__byte_perm() also works on pre-Fermi platforms, but only as a rather slow emulation).

Tested on a 560 Ti. PRMT is about 30% faster than ISCADD in my test cases.

Thanks for the feedback, it is good to know that exposing Fermi’s PRMT instruction via the __byte_perm() intrinsic is helpful in increasing the performance of real-life applications.