I have a device function where I would like to use the __byte_perm intrinsic like this:
#if CUDA_DEVICE_ARCH >= 200
unsigned int lo = __byte_perm(thread_staging[4], thread_staging[5], 0x0411) | __byte_perm(thread_staging[6], thread_staging[7], 0x01104);
unsigned int hi = __byte_perm(thread_staging[0], thread_staging[1], 0x0411) | __byte_perm(thread_staging[2], thread_staging[3], 0x01104);
#else
unsigned int lo = (thread_staging[4] << 24) | (thread_staging[5] << 16) | (thread_staging[6] << 8) | thread_staging[7];
unsigned int hi = (thread_staging[0] << 24) | (thread_staging[1] << 16) | (thread_staging[2] << 8) | thread_staging[3];
#endif
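For reference, this is how I understand the prmt selector: each nibble picks one byte of the concatenated pair {b, a}, lowest result byte first. The helper names below are just for illustration, and I am assuming each thread_staging element holds a single byte in its low 8 bits:
__device__ unsigned int pack_hi_pair(unsigned int x, unsigned int y)
{
    // selector 0x0411, nibbles 1,1,4,0 -> result bytes x.b1, x.b1, y.b0, x.b0
    // with single-byte inputs this is (x << 24) | (y << 16)
    return __byte_perm(x, y, 0x0411);
}
__device__ unsigned int pack_lo_pair(unsigned int x, unsigned int y)
{
    // selector 0x1104 (0x01104 above is the same value), nibbles 4,0,1,1
    // -> result bytes y.b0, x.b0, x.b1, x.b1, i.e. (x << 8) | y
    return __byte_perm(x, y, 0x1104);
}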
Unfortunately, the compiler generates suboptimal code for this intrinsic. Each selector constant gets its own mov and is then masked with 0x7777 (30583) before the prmt:
ld.shared.u32 %r15, [%rd4+16];
ld.shared.u32 %r16, [%rd4+20];
ld.shared.u32 %r17, [%rd4+24];
ld.shared.u32 %r18, [%rd4+28];
ld.shared.u32 %r19, [%rd4+0];
ld.shared.u32 %r20, [%rd4+4];
ld.shared.u32 %r21, [%rd4+8];
ld.shared.u32 %r22, [%rd4+12];
mov.u32 %r23, 1041;
and.b32 %r24, %r23, 30583;
prmt.b32 %r25, %r15, %r16, %r24;
mov.u32 %r26, 4356;
and.b32 %r27, %r26, 30583;
prmt.b32 %r28, %r17, %r18, %r27;
or.b32 %r29, %r25, %r28;
mov.u32 %r30, 1041;
and.b32 %r31, %r30, 30583;
prmt.b32 %r32, %r19, %r20, %r31;
mov.u32 %r33, 4356;
and.b32 %r34, %r33, 30583;
prmt.b32 %r35, %r21, %r22, %r34;
or.b32 %r36, %r32, %r35;
So I decided to use inline asm to force the compiler to generate better code, with this function:
__device__ __forceinline__ static
unsigned int byte_perm(unsigned int a, unsigned int b, unsigned int slct)
{
#if CUDA_DEVICE_ARCH >= 200
    unsigned int result;
    asm("prmt.b32 %0, %1, %2, %3;" : "=r"(result) : "r"(a), "r"(b), "r"(slct));
    return result;
#else
    return __byte_perm(a, b, slct);
#endif
}
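For completeness, the calling code is the same as the first snippet, only with __byte_perm replaced by the wrapper:
unsigned int lo = byte_perm(thread_staging[4], thread_staging[5], 0x0411) | byte_perm(thread_staging[6], thread_staging[7], 0x01104);
unsigned int hi = byte_perm(thread_staging[0], thread_staging[1], 0x0411) | byte_perm(thread_staging[2], thread_staging[3], 0x01104);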
Unfortunately, this makes the code even worse:
ld.shared.u32 %r15, [%rd4+16];
mov.u32 %r16, %r15;
ld.shared.u32 %r17, [%rd4+20];
mov.u32 %r18, %r17;
mov.u32 %r19, 1041;
mov.u32 %r20, %r19;
prmt.b32 %r21, %r16, %r18, %r20;
mov.s32 %r22, %r21;
ld.shared.u32 %r23, [%rd4+24];
mov.u32 %r24, %r23;
ld.shared.u32 %r25, [%rd4+28];
mov.u32 %r26, %r25;
mov.u32 %r27, 4356;
mov.u32 %r28, %r27;
prmt.b32 %r29, %r24, %r26, %r28;
mov.s32 %r30, %r29;
or.b32 %r31, %r22, %r30;
ld.shared.u32 %r32, [%rd4+0];
mov.u32 %r33, %r32;
ld.shared.u32 %r34, [%rd4+4];
mov.u32 %r35, %r34;
mov.u32 %r36, 1041;
mov.u32 %r37, %r36;
prmt.b32 %r38, %r33, %r35, %r37;
mov.s32 %r39, %r38;
ld.shared.u32 %r40, [%rd4+8];
mov.u32 %r41, %r40;
ld.shared.u32 %r42, [%rd4+12];
mov.u32 %r43, %r42;
mov.u32 %r44, 4356;
mov.u32 %r45, %r44;
prmt.b32 %r46, %r41, %r43, %r45;
mov.s32 %r47, %r46;
So what am I doing wrong? How can I avoid the unnecessary register-to-register copies?