A simple test, attempting to “read” code, doesn’t seem to work for me:
# cat t138.cu
#include <cstdio>
__device__ void foo(int &a){a += 13;}
__device__ void (*myfp)(int &a) = foo;
__global__ void k(int val) {
#ifdef TEST
unsigned char *mycode = (unsigned char *)myfp;
for (int i = 0; i < 1; i++) printf("%u ", mycode[i]);
#endif
printf("\n");
myfp(val);
printf("val = %d\n", val);
}
int main(){
k<<<1,1>>>(4);
cudaDeviceSynchronize();
}
# nvcc -o t138 t138.cu -rdc=true
# compute-sanitizer ./t138
========= COMPUTE-SANITIZER
val = 17
========= ERROR SUMMARY: 0 errors
# nvcc -o t138 t138.cu -rdc=true -DTEST
# compute-sanitizer ./t138
========= COMPUTE-SANITIZER
========= Invalid __global__ read of size 1 bytes
========= at 0xa0 in k(int)
========= by thread (0,0,0) in block (0,0,0)
========= Address 0x7fcc0fc5d100 is out of bounds
========= and is 54,906,361 bytes after the nearest allocation at 0x7fcc0c800300 of size 8 bytes
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame: [0x32e950]
========= in /lib/x86_64-linux-gnu/libcuda.so.1
========= Host Frame:libcudart_static_4d8b33a106dceb3c07a56e26de61f2d53bb62a68 [0x1074e]
========= in /root/bobc/./t138
========= Host Frame:cudaLaunchKernel [0x7095e]
========= in /root/bobc/./t138
========= Host Frame:cudaError cudaLaunchKernel<char>(char const*, dim3, dim3, void**, unsigned long, CUstream_st*) [0xb0ca]
========= in /root/bobc/./t138
========= Host Frame:__device_stub__Z1ki(int) [0xaf70]
========= in /root/bobc/./t138
========= Host Frame:k(int) [0xafa3]
========= in /root/bobc/./t138
========= Host Frame:main [0xade5]
========= in /root/bobc/./t138
========= Host Frame: [0x29d90]
========= in /lib/x86_64-linux-gnu/libc.so.6
========= Host Frame:__libc_start_main [0x29e40]
========= in /lib/x86_64-linux-gnu/libc.so.6
========= Host Frame:_start [0xaad5]
========= in /root/bobc/./t138
=========
========= Program hit cudaErrorLaunchFailure (error 719) due to "unspecified launch failure" on CUDA API call to cudaDeviceSynchronize.
========= Saved host backtrace up to driver entry point at error
========= Host Frame: [0x47e786]
========= in /lib/x86_64-linux-gnu/libcuda.so.1
========= Host Frame:cudaDeviceSynchronize [0x48874]
========= in /root/bobc/./t138
========= Host Frame:main [0xadea]
========= in /root/bobc/./t138
========= Host Frame: [0x29d90]
========= in /lib/x86_64-linux-gnu/libc.so.6
========= Host Frame:__libc_start_main [0x29e40]
========= in /lib/x86_64-linux-gnu/libc.so.6
========= Host Frame:_start [0xaad5]
========= in /root/bobc/./t138
=========
========= ERROR SUMMARY: 2 errors
# cuobjdump -sass ./t138
Fatbin elf code:
================
arch = sm_52
code version = [1,7]
host = linux
compile_size = 64bit
code for sm_52
Function : _Z3fooRi
.headerflags @"EF_CUDA_TEXMODE_UNIFIED EF_CUDA_64BIT_ADDRESS EF_CUDA_SM52 EF_CUDA_VIRTUAL_SM(EF_CUDA_SM52)"
/* 0x001ec800fda207f6 */
/*0008*/ LEA R6.CC, R4.reuse, RZ ; /* 0x5bd780000ff70406 */
/*0010*/ LEA.HI.X P0, R7, R4, RZ, R5 ; /* 0x5bd802c00ff70407 */
/*0018*/ LD.E R0, [R6], P0 ; /* 0x8090000000070600 */
/* 0x003ffc001fa107f4 */
/*0028*/ IADD32I R0, R0, 0xd ; /* 0x1c00000000d70000 */
/*0030*/ ST.E [R6], R0, P0 ; /* 0xa090000000070600 */
/*0038*/ RET ; /* 0xe32000000007000f */
/* 0x001f8000fc0007ff */
/*0048*/ BRA 0x40 ; /* 0xe2400fffff07000f */
/*0050*/ NOP; /* 0x50b0000000070f00 */
/*0058*/ NOP; /* 0x50b0000000070f00 */
/* 0x001f8000fc0007e0 */
/*0068*/ NOP; /* 0x50b0000000070f00 */
/*0070*/ NOP; /* 0x50b0000000070f00 */
/*0078*/ NOP; /* 0x50b0000000070f00 */
..........
Function : _Z1ki
.headerflags @"EF_CUDA_TEXMODE_UNIFIED EF_CUDA_64BIT_ADDRESS EF_CUDA_SM52 EF_CUDA_VIRTUAL_SM(EF_CUDA_SM52)"
/* 0x001fc800fe2007f6 */
/*0008*/ MOV R1, c[0x0][0x20] ; /* 0x4c98078000870001 */
/*0010*/ MOV32I R18, 0x0 ; /* 0x010000000007f012 */
/*0018*/ MOV32I R19, 0x0 ; /* 0x010000000007f013 */
/* 0x0c1fd800f64007f0 */
/*0028*/ IADD32I R1, R1, -0x10 ; /* 0x1c0fffffff070101 */
/*0030*/ LDG.E.64 R2, [R18] ; /* 0xeed5200000071202 */
/*0038*/ LEA R8.CC, R2.reuse, RZ ; /* 0x5bd780000ff70208 */
/* 0x0003e800fc4007f1 */
/*0048*/ LEA.HI.X P0, R9, R2, RZ, R3 ; /* 0x5bd801c00ff70209 */
/*0050*/ MOV R3, c[0x0][0x140] ; /* 0x4c98078005070003 */
/*0058*/ STL [R1], R3 ; /* 0xef54000000070103 */
/* 0x001fc400fe2007b1 */
/*0068*/ LD.E.U8 R0, [R8], P0 ; /* 0x8010000000070800 */
/*0070*/ LOP.OR R2, R1, c[0x0][0x4] ; /* 0x4c47020000170102 */
/*0078*/ MOV32I R4, 0x0 ; /* 0x010000000007f004 */
/* 0x001fc400fcc007f4 */
/*0088*/ MOV32I R5, 0x0 ; /* 0x010000000007f005 */
/*0090*/ IADD32I R17.CC, R2, 0x8 ; /* 0x1c10000000870211 */
/*0098*/ IADD.X R16, RZ, RZ ; /* 0x5c1008000ff7ff10 */
/* 0x0403d400fe2007f5 */
/*00a8*/ MOV R6, R17 ; /* 0x5c98078001170006 */
/*00b0*/ MOV R7, R16 ; /* 0x5c98078001070007 */
/*00b8*/ STL [R1+0x8], R0 ; /* 0xef54000000870100 */
/* 0x001fc400fe200ffd */
/*00c8*/ JCAL 0x0 ; /* 0xe220000000000040 */
/*00d0*/ MOV32I R4, 0x0 ; /* 0x010000000007f004 */
/*00d8*/ MOV32I R5, 0x0 ; /* 0x010000000007f005 */
/* 0x001ff400fec007f1 */
/*00e8*/ MOV R6, RZ ; /* 0x5c9807800ff70006 */
/*00f0*/ MOV R7, RZ ; /* 0x5c9807800ff70007 */
/*00f8*/ JCAL 0x0 ; /* 0xe220000000000040 */
/* 0x001fd800162007f0 */
/*0108*/ MOV R4, R2 ; /* 0x5c98078000270004 */
/*0110*/ LDG.E R18, [R18] ; /* 0xeed4200000071212 */
/*0118*/ MOV R5, RZ ; /* 0x5c9807800ff70005 */
/* 0x001fc021ffa007f1 */
/*0128*/ PRET 0x138 ; /* 0xe270000000800040 */
/*0130*/ JMX R18 ; /* 0xe20000000007120f */
/*0138*/ MOV R6, R17 ; /* 0x5c98078001170006 */
/* 0x001fc400fe2007b1 */
/*0148*/ LDL R0, [R1] ; /* 0xef44000000070100 */
/*0150*/ MOV R7, R16 ; /* 0x5c98078001070007 */
/*0158*/ MOV32I R4, 0x0 ; /* 0x010000000007f004 */
/* 0x003ff4201ea007f1 */
/*0168*/ MOV32I R5, 0x0 ; /* 0x010000000007f005 */
/*0170*/ STL [R1+0x8], R0 ; /* 0xef54000000870100 */
/*0178*/ JCAL 0x0 ; /* 0xe220000000000040 */
/* 0x001f8000ffe007ff */
/*0188*/ EXIT ; /* 0xe30000000007000f */
/*0190*/ BRA 0x190 ; /* 0xe2400fffff87000f */
/*0198*/ NOP; /* 0x50b0000000070f00 */
/* 0x001f8000fc0007e0 */
/*01a8*/ NOP; /* 0x50b0000000070f00 */
/*01b0*/ NOP; /* 0x50b0000000070f00 */
/*01b8*/ NOP; /* 0x50b0000000070f00 */
..........
It’s also not obvious to me that one SASS instruction universally has a size of 16 bytes, although I acknowledge it might be possible to express every SASS instruction in a 16-byte field.