Ptxas slow

A simple test, attempting to “read” code, doesn’t seem to work for me:

# cat t138.cu
#include <cstdio>

__device__ void foo(int &a){a += 13;}

__device__ void (*myfp)(int &a) = foo;

__global__ void k(int val) {
#ifdef TEST
  unsigned char *mycode = (unsigned char *)myfp;
  for (int i = 0; i < 1; i++) printf("%u ", mycode[i]);
#endif
  printf("\n");
  myfp(val);
  printf("val = %d\n", val);
}

int main(){

  k<<<1,1>>>(4);
  cudaDeviceSynchronize();
}
# nvcc -o t138 t138.cu -rdc=true
# compute-sanitizer ./t138
========= COMPUTE-SANITIZER

val = 17
========= ERROR SUMMARY: 0 errors
# nvcc -o t138 t138.cu -rdc=true -DTEST
# compute-sanitizer ./t138
========= COMPUTE-SANITIZER
========= Invalid __global__ read of size 1 bytes
=========     at 0xa0 in k(int)
=========     by thread (0,0,0) in block (0,0,0)
=========     Address 0x7fcc0fc5d100 is out of bounds
=========     and is 54,906,361 bytes after the nearest allocation at 0x7fcc0c800300 of size 8 bytes
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame: [0x32e950]
=========                in /lib/x86_64-linux-gnu/libcuda.so.1
=========     Host Frame:libcudart_static_4d8b33a106dceb3c07a56e26de61f2d53bb62a68 [0x1074e]
=========                in /root/bobc/./t138
=========     Host Frame:cudaLaunchKernel [0x7095e]
=========                in /root/bobc/./t138
=========     Host Frame:cudaError cudaLaunchKernel<char>(char const*, dim3, dim3, void**, unsigned long, CUstream_st*) [0xb0ca]
=========                in /root/bobc/./t138
=========     Host Frame:__device_stub__Z1ki(int) [0xaf70]
=========                in /root/bobc/./t138
=========     Host Frame:k(int) [0xafa3]
=========                in /root/bobc/./t138
=========     Host Frame:main [0xade5]
=========                in /root/bobc/./t138
=========     Host Frame: [0x29d90]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame:__libc_start_main [0x29e40]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame:_start [0xaad5]
=========                in /root/bobc/./t138
=========
========= Program hit cudaErrorLaunchFailure (error 719) due to "unspecified launch failure" on CUDA API call to cudaDeviceSynchronize.
=========     Saved host backtrace up to driver entry point at error
=========     Host Frame: [0x47e786]
=========                in /lib/x86_64-linux-gnu/libcuda.so.1
=========     Host Frame:cudaDeviceSynchronize [0x48874]
=========                in /root/bobc/./t138
=========     Host Frame:main [0xadea]
=========                in /root/bobc/./t138
=========     Host Frame: [0x29d90]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame:__libc_start_main [0x29e40]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame:_start [0xaad5]
=========                in /root/bobc/./t138
=========
========= ERROR SUMMARY: 2 errors
# cuobjdump -sass ./t138

Fatbin elf code:
================
arch = sm_52
code version = [1,7]
host = linux
compile_size = 64bit

        code for sm_52
                Function : _Z3fooRi
        .headerflags    @"EF_CUDA_TEXMODE_UNIFIED EF_CUDA_64BIT_ADDRESS EF_CUDA_SM52 EF_CUDA_VIRTUAL_SM(EF_CUDA_SM52)"
                                                                  /* 0x001ec800fda207f6 */
        /*0008*/                   LEA R6.CC, R4.reuse, RZ ;      /* 0x5bd780000ff70406 */
        /*0010*/                   LEA.HI.X P0, R7, R4, RZ, R5 ;  /* 0x5bd802c00ff70407 */
        /*0018*/                   LD.E R0, [R6], P0 ;            /* 0x8090000000070600 */
                                                                  /* 0x003ffc001fa107f4 */
        /*0028*/                   IADD32I R0, R0, 0xd ;          /* 0x1c00000000d70000 */
        /*0030*/                   ST.E [R6], R0, P0 ;            /* 0xa090000000070600 */
        /*0038*/                   RET ;                          /* 0xe32000000007000f */
                                                                  /* 0x001f8000fc0007ff */
        /*0048*/                   BRA 0x40 ;                     /* 0xe2400fffff07000f */
        /*0050*/                   NOP;                           /* 0x50b0000000070f00 */
        /*0058*/                   NOP;                           /* 0x50b0000000070f00 */
                                                                  /* 0x001f8000fc0007e0 */
        /*0068*/                   NOP;                           /* 0x50b0000000070f00 */
        /*0070*/                   NOP;                           /* 0x50b0000000070f00 */
        /*0078*/                   NOP;                           /* 0x50b0000000070f00 */
                ..........


                Function : _Z1ki
        .headerflags    @"EF_CUDA_TEXMODE_UNIFIED EF_CUDA_64BIT_ADDRESS EF_CUDA_SM52 EF_CUDA_VIRTUAL_SM(EF_CUDA_SM52)"
                                                                  /* 0x001fc800fe2007f6 */
        /*0008*/                   MOV R1, c[0x0][0x20] ;         /* 0x4c98078000870001 */
        /*0010*/                   MOV32I R18, 0x0 ;              /* 0x010000000007f012 */
        /*0018*/                   MOV32I R19, 0x0 ;              /* 0x010000000007f013 */
                                                                  /* 0x0c1fd800f64007f0 */
        /*0028*/                   IADD32I R1, R1, -0x10 ;        /* 0x1c0fffffff070101 */
        /*0030*/                   LDG.E.64 R2, [R18] ;           /* 0xeed5200000071202 */
        /*0038*/                   LEA R8.CC, R2.reuse, RZ ;      /* 0x5bd780000ff70208 */
                                                                  /* 0x0003e800fc4007f1 */
        /*0048*/                   LEA.HI.X P0, R9, R2, RZ, R3 ;  /* 0x5bd801c00ff70209 */
        /*0050*/                   MOV R3, c[0x0][0x140] ;        /* 0x4c98078005070003 */
        /*0058*/                   STL [R1], R3 ;                 /* 0xef54000000070103 */
                                                                  /* 0x001fc400fe2007b1 */
        /*0068*/                   LD.E.U8 R0, [R8], P0 ;         /* 0x8010000000070800 */
        /*0070*/                   LOP.OR R2, R1, c[0x0][0x4] ;   /* 0x4c47020000170102 */
        /*0078*/                   MOV32I R4, 0x0 ;               /* 0x010000000007f004 */
                                                                  /* 0x001fc400fcc007f4 */
        /*0088*/                   MOV32I R5, 0x0 ;               /* 0x010000000007f005 */
        /*0090*/                   IADD32I R17.CC, R2, 0x8 ;      /* 0x1c10000000870211 */
        /*0098*/                   IADD.X R16, RZ, RZ ;           /* 0x5c1008000ff7ff10 */
                                                                  /* 0x0403d400fe2007f5 */
        /*00a8*/                   MOV R6, R17 ;                  /* 0x5c98078001170006 */
        /*00b0*/                   MOV R7, R16 ;                  /* 0x5c98078001070007 */
        /*00b8*/                   STL [R1+0x8], R0 ;             /* 0xef54000000870100 */
                                                                  /* 0x001fc400fe200ffd */
        /*00c8*/                   JCAL 0x0 ;                     /* 0xe220000000000040 */
        /*00d0*/                   MOV32I R4, 0x0 ;               /* 0x010000000007f004 */
        /*00d8*/                   MOV32I R5, 0x0 ;               /* 0x010000000007f005 */
                                                                  /* 0x001ff400fec007f1 */
        /*00e8*/                   MOV R6, RZ ;                   /* 0x5c9807800ff70006 */
        /*00f0*/                   MOV R7, RZ ;                   /* 0x5c9807800ff70007 */
        /*00f8*/                   JCAL 0x0 ;                     /* 0xe220000000000040 */
                                                                  /* 0x001fd800162007f0 */
        /*0108*/                   MOV R4, R2 ;                   /* 0x5c98078000270004 */
        /*0110*/                   LDG.E R18, [R18] ;             /* 0xeed4200000071212 */
        /*0118*/                   MOV R5, RZ ;                   /* 0x5c9807800ff70005 */
                                                                  /* 0x001fc021ffa007f1 */
        /*0128*/                   PRET 0x138 ;                   /* 0xe270000000800040 */
        /*0130*/                   JMX R18 ;                      /* 0xe20000000007120f */
        /*0138*/                   MOV R6, R17 ;                  /* 0x5c98078001170006 */
                                                                  /* 0x001fc400fe2007b1 */
        /*0148*/                   LDL R0, [R1] ;                 /* 0xef44000000070100 */
        /*0150*/                   MOV R7, R16 ;                  /* 0x5c98078001070007 */
        /*0158*/                   MOV32I R4, 0x0 ;               /* 0x010000000007f004 */
                                                                  /* 0x003ff4201ea007f1 */
        /*0168*/                   MOV32I R5, 0x0 ;               /* 0x010000000007f005 */
        /*0170*/                   STL [R1+0x8], R0 ;             /* 0xef54000000870100 */
        /*0178*/                   JCAL 0x0 ;                     /* 0xe220000000000040 */
                                                                  /* 0x001f8000ffe007ff */
        /*0188*/                   EXIT ;                         /* 0xe30000000007000f */
        /*0190*/                   BRA 0x190 ;                    /* 0xe2400fffff87000f */
        /*0198*/                   NOP;                           /* 0x50b0000000070f00 */
                                                                  /* 0x001f8000fc0007e0 */
        /*01a8*/                   NOP;                           /* 0x50b0000000070f00 */
        /*01b0*/                   NOP;                           /* 0x50b0000000070f00 */
        /*01b8*/                   NOP;                           /* 0x50b0000000070f00 */
                ..........

It’s also not obvious to me that one SASS instruction universally has a size of 16 bytes, although I acknowledge it might be possible to express every SASS instruction in a 16-byte field.

1 Like