Thanks for the pointer. Yeah, that should work just fine. By the way, is there a reason __dp4a
is not mentioned anywhere in the CUDA Programming Guide? I wanted to refresh my memory on how exactly __dp4a
is specified, and could not find it when searching the latest PDF.
I infer from the PTX documentation that passing int32_t
(or int
) indicates that we are passing char4
whose elements should be sign extended while passing uint32_t
(or unsigned int
) indicates we are passing uchar4
whose elements should be zero extended?
[Later:]
Looking at the relevant header file (sm_61_intrinsics.h
), I see that there is
int __dp4a (int, int, int)
int __dp4a (char4, char4, int)
for the signed operation, and
unsigned int __dp4a (unsigned int, unsigned int, unsigned int)
unsigned int __dp4a (uchar4, uchar4, unsigned int)
for the unsigned operation.
While the code generated for __dp4a()
with int
arguments looks as expected, the code generated for __dp4a()
with char4
arguments looks needlessly complex. I would have expected the generated code to look the same. I wonder what is going on there?
My little test app, compiled on CUDA 11.1:
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#define USE_INT (0)
#if USE_INT
__global__ void kernel (int32_t a, int32_t b, int32_t *res)
#else // USE_INT
__global__ void kernel (char4 a, char4 b, int32_t *res)
#endif // USE_INT
{
*res = __dp4a (a, b, 0);
}
int main (void)
{
#if USE_INT
int32_t a = 0xfcfdfeff;
int32_t b = 0x01010101;
#else // USE_INT
char4 a = {-1, -2, -3, -4};
char4 b = {1, 1, 1, 1};
#endif // USE_INT
int32_t res, *res_d = 0;
cudaMalloc ((void**)&res_d, sizeof res_d[0]);
kernel<<<1,1>>>(a, b, res_d);
cudaMemcpy (&res, res_d, sizeof res, cudaMemcpyDeviceToHost);
printf ("sum = %d\n", res);
return EXIT_SUCCESS;
}
With USE_INT = 1
, the code generated for an sm_61
target looks as expected:
code for sm_61
Function : _Z6kerneliiPi
.headerflags @"EF_CUDA_SM61 EF_CUDA_PTX_SM(EF_CUDA_SM61)"
/*0008*/ MOV R1, c[0x0][0x20] ;
/*0010*/ MOV R0, c[0x0][0x140] ;
/*0018*/ MOV R2, c[0x0][0x148] ;
/*0028*/ MOV R3, c[0x0][0x14c] ;
/*0030*/ IDP.4A.S8.S8 R0, R0, c[0x0][0x144], RZ ;
/*0038*/ STG.E [R2], R0 ;
/*0048*/ NOP ;
/*0050*/ NOP ;
/*0058*/ NOP ;
/*0068*/ EXIT ;
/*0070*/ BRA 0x70 ;
/*0078*/ NOP;
..........
I would have expected the code generated for USE_INT = 0
to look identical, but instead got this monstrosity:
code for sm_61
Function : _Z6kernel5char4S_Pi
.headerflags @"EF_CUDA_SM61 EF_CUDA_PTX_SM(EF_CUDA_SM61)"
/*0008*/ MOV R1, c[0x0][0x20] ;
/*0010*/ LDC.U8 R3, c[0x0][0x142] ;
/*0018*/ LDC.U8 R6, c[0x0][0x146] ;
/*0028*/ LDC.U8 R0, c[0x0][0x143] ;
/*0030*/ LDC.U8 R7, c[0x0][0x141] ;
/*0038*/ LDC.U8 R4, c[0x0][0x147] ;
/*0048*/ LDC.U8 R8, c[0x0][0x145] ;
/*0050*/ MOV R2, c[0x0][0x140] ;
/*0058*/ MOV R5, c[0x0][0x144] ;
/*0068*/ LOP32I.AND R2, R2, 0xff ;
/*0070*/ LOP32I.AND R5, R5, 0xff ;
/*0078*/ PRMT R2, R3, 0x7054, R2 ;
/*0088*/ PRMT R5, R6, 0x7054, R5 ;
/*0090*/ SHL R3, R8, 0x8 ;
/*0098*/ PRMT R2, R0, 0x654, R2 ;
/*00a8*/ SHL R0, R7, 0x8 ;
/*00b0*/ PRMT R4, R4, 0x654, R5 ;
/*00b8*/ LOP.OR R0, R2, R0 ;
/*00c8*/ LOP.OR R5, R4, R3 ;
/*00d0*/ MOV R2, c[0x0][0x148] ;
/*00d8*/ MOV R3, c[0x0][0x14c] ;
/*00e8*/ IDP.4A.S8.S8 R0, R0, R5, RZ ;
/*00f0*/ STG.E [R2], R0 ;
/*00f8*/ NOP ;
/*0108*/ EXIT ;
/*0110*/ BRA 0x110 ;
/*0118*/ NOP;
/*0128*/ NOP;
/*0130*/ NOP;
/*0138*/ NOP;
..........