You tell me:
$ cat t74.cu
__device__
constexpr unsigned char getBitMaskFunc(unsigned char i, unsigned char j) {
unsigned char BIT_MASK[4][8] = {
{ 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 }, // 1 bit
{ 0x03, 0x0C, 0x30, 0xC0, 0x00, 0x00, 0x00, 0x00 }, // 2 bit
{ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, // Nan
{ 0x0F, 0xF0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 } // 4 bit
};
return BIT_MASK[i][j];
}
__global__ void k(unsigned char *d){
*d = getBitMaskFunc(threadIdx.x,0);
}
__constant__
static const unsigned char BIT_MASK[4][8] = {
{ 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 }, // 1 bit
{ 0x03, 0x0C, 0x30, 0xC0, 0x00, 0x00, 0x00, 0x00 }, // 2 bit
{ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, // Nan
{ 0x0F, 0xF0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 } // 4 bit
};
__global__ void j(unsigned char *d){
*d = BIT_MASK[threadIdx.x][0];
}
int main(){
unsigned char *data = NULL;
k<<<1,1>>>(data);
cudaDeviceSynchronize();
j<<<1,1>>>(data);
cudaDeviceSynchronize();
}
$ nvcc -o t74 t74.cu -std=c++17
$ cuobjdump -sass ./t74
Fatbin elf code:
================
arch = sm_52
code version = [1,7]
producer = <unknown>
host = linux
compile_size = 64bit
code for sm_52
Fatbin elf code:
================
arch = sm_52
code version = [1,7]
producer = <unknown>
host = linux
compile_size = 64bit
code for sm_52
Function : _Z1jPh
.headerflags @"EF_CUDA_SM52 EF_CUDA_PTX_SM(EF_CUDA_SM52)"
/* 0x001c4400fe0007f6 */
/*0008*/ MOV R1, c[0x0][0x20] ; /* 0x4c98078000870001 */
/*0010*/ { MOV R2, c[0x0][0x140] ; /* 0x4c98078005070002 */
/*0018*/ S2R R0, SR_TID.X }
/* 0xf0c8000002170000 */
/* 0x001c7c01fe8007ff */
/*0028*/ MOV R3, c[0x0][0x144] ; /* 0x4c98078005170003 */
/*0030*/ SHL R0, R0, 0x3 ; /* 0x3848000000370000 */
/*0038*/ LDC.U8 R0, c[0x3][R0] ; /* 0xef90003000070000 */
/* 0x001ffc00fce00ff1 */
/*0048*/ STG.E.U8 [R2], R0 ; /* 0xeed8200000070200 */
/*0050*/ NOP ; /* 0x50b0000000070f00 */
/*0058*/ EXIT ; /* 0xe30000000007000f */
/* 0x001f8000fc0007ff */
/*0068*/ BRA 0x60 ; /* 0xe2400fffff07000f */
/*0070*/ NOP; /* 0x50b0000000070f00 */
/*0078*/ NOP; /* 0x50b0000000070f00 */
..........
Function : _Z1kPh
.headerflags @"EF_CUDA_SM52 EF_CUDA_PTX_SM(EF_CUDA_SM52)"
/* 0x001c4400fe0007f6 */
/*0008*/ MOV R1, c[0x0][0x20] ; /* 0x4c98078000870001 */
/*0010*/ { IADD32I R1, R1, -0x20 ; /* 0x1c0ffffffe070101 */
/*0018*/ S2R R0, SR_TID.X }
/* 0xf0c8000002170000 */
/* 0x001fc400fe2007f1 */
/*0028*/ MOV32I R4, 0x8040201 ; /* 0x010080402017f004 */
/*0030*/ MOV32I R5, 0x80402010 ; /* 0x010804020107f005 */
/*0038*/ MOV32I R6, 0xc0300c03 ; /* 0x010c0300c037f006 */
/* 0x001fc400fe0007f2 */
/*0048*/ MOV R7, RZ ; /* 0x5c9807800ff70007 */
/*0050*/ { MOV32I R10, 0xf00f ; /* 0x0100000f00f7f00a */
/*0058*/ STL.128 [R1], R4 }
/* 0xef56000000070104 */
/* 0x001fc800fe2007e1 */
/*0068*/ MOV R8, RZ ; /* 0x5c9807800ff70008 */
/*0070*/ MOV R9, RZ ; /* 0x5c9807800ff70009 */
/*0078*/ MOV R11, RZ ; /* 0x5c9807800ff7000b */
/* 0x001fc801fec007fd */
/*0088*/ STL.128 [R1+0x10], R8 ; /* 0xef56000001070108 */
/*0090*/ LOP32I.AND R0, R0, 0xff ; /* 0x040000000ff70000 */
/*0098*/ LEA R0, R0, R1, 0x3 ; /* 0x5bd7018000170000 */
/* 0x001fc800fc2007b1 */
/*00a8*/ LDL.U8 R0, [R0] ; /* 0xef40000000070000 */
/*00b0*/ MOV R2, c[0x0][0x140] ; /* 0x4c98078005070002 */
/*00b8*/ MOV R3, c[0x0][0x144] ; /* 0x4c98078005170003 */
/* 0x001ffc00fda107f1 */
/*00c8*/ STG.E.U8 [R2], R0 ; /* 0xeed8200000070200 */
/*00d0*/ NOP ; /* 0x50b0000000070f00 */
/*00d8*/ EXIT ; /* 0xe30000000007000f */
/* 0x001f8000fc0007ff */
/*00e8*/ BRA 0xe0 ; /* 0xe2400fffff07000f */
/*00f0*/ NOP; /* 0x50b0000000070f00 */
/*00f8*/ NOP; /* 0x50b0000000070f00 */
..........
Fatbin ptx code:
================
arch = sm_52
code version = [7,1]
producer = <unknown>
host = linux
compile_size = 64bit
compressed
$