Manually perform separate compilation with ptxas and nvlink

I’m trying to JIT PTX code that requires cudadevrt. Regular compilation works fine, as follows:

Starting with some PTX code:

.version 6.3
.target sm_75, debug
.address_size 64

        // .globl       _Z17julia_kernel_8409   // -- Begin function _Z17julia_kernel_8409
                                        // @_Z17julia_kernel_8409
.visible .entry _Z17julia_kernel_8409()
{

Lfunc_begin0:

// %bb.0:                               // %entry
        ret;
Lfunc_end0:
                                        // -- End function
}

I then call ptxas --gpu-name sm_75 --verbose --output-file $output_file $input_file, and disassembling the resulting cubin gives:

//--------------------- .text._Z17julia_kernel_8409 --------------------------
        .section        .text._Z17julia_kernel_8409,"ax",@progbits
        .sectioninfo    @"SHI_REGISTERS=4"
        .align  128
        .global         _Z17julia_kernel_8409
        .type           _Z17julia_kernel_8409,@function
        .size           _Z17julia_kernel_8409,(.L_3 - _Z17julia_kernel_8409)
        .other          _Z17julia_kernel_8409,@"STO_CUDA_ENTRY STV_DEFAULT"
_Z17julia_kernel_8409:
.text._Z17julia_kernel_8409:
        /*0000*/                   MOV R1, c[0x0][0x28] ;
.L_2:
        /*0010*/                   MEMBAR.SC.VC ;
        /*0020*/                   ERRBAR ;
        /*0030*/                   EXIT ;
.L_1:
        /*0070*/                   BRA `(.L_1);
.L_3:

I then load that code using cuModuleLoadDataEx, which works as expected.
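For reference, the load step looks roughly like this. This is only a minimal sketch: the file name and the missing error handling are mine, and the kernel name is the one from the PTX above.

#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>

int main(void) {
    cuInit(0);
    CUdevice dev;  cuDeviceGet(&dev, 0);
    CUcontext ctx; cuCtxCreate(&ctx, 0, dev);

    // Read the cubin produced by ptxas into memory.
    FILE *f = fopen("kernel.cubin", "rb");
    fseek(f, 0, SEEK_END);
    long size = ftell(f);
    fseek(f, 0, SEEK_SET);
    void *image = malloc(size);
    fread(image, 1, size, f);
    fclose(f);

    // Load the image and look up the kernel by its mangled name.
    CUmodule mod;
    if (cuModuleLoadDataEx(&mod, image, 0, NULL, NULL) != CUDA_SUCCESS)
        return 1;
    CUfunction fun;
    if (cuModuleGetFunction(&fun, mod, "_Z17julia_kernel_8409") != CUDA_SUCCESS)
        return 1;

    // Launch the (empty) kernel with a single thread.
    cuLaunchKernel(fun, 1, 1, 1, 1, 1, 1, 0, NULL, NULL, NULL);
    cuCtxSynchronize();
    free(image);
    return 0;
}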

Now I’m trying to add cudadevrt to the mix. As far as I understand, I then need to compile with --compile-only (otherwise I’d get non-relocatable code, and failures due to the missing device-runtime symbols). That gives me an object I can’t load directly (as expected), but I can run nvdisasm on it and the output looks fine:

//--------------------- .text._Z17julia_kernel_8409 --------------------------
        .section        .text._Z17julia_kernel_8409,"ax",@progbits
        .sectioninfo    @"SHI_REGISTERS=24"
        .align  128
        .global         _Z17julia_kernel_8409
        .type           _Z17julia_kernel_8409,@function
        .size           _Z17julia_kernel_8409,(.L_33 - _Z17julia_kernel_8409)
        .other          _Z17julia_kernel_8409,@"STO_CUDA_ENTRY STV_DEFAULT"
_Z17julia_kernel_8409:
...

I then call nvlink --arch sm_75 --library /opt/cuda/lib64/libcudadevrt.a --output-file ... $input_file to get a loadable object, but the resulting file is ‘empty’!

        .headerflags    @"EF_CUDA_TEXMODE_UNIFIED EF_CUDA_64BIT_ADDRESS EF_CUDA_SM75 EF_CUDA_VIRTUAL_SM(EF_CUDA_SM75)"
        .elftype        @"ET_EXEC"


//--------------------- .nv.rel.action            --------------------------
        .section        .nv.rel.action,"",@"SHT_CUDA_RELOCINFO"
        .align  8
        .sectionentsize 8
        /*0000*/        .byte   0x4b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x08, 0x10, 0x0a, 0x2f, 0x22
        /*0010*/        .byte   0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08, 0x08, 0x00, 0x00, 0x00, 0x00
        /*0020*/        .byte   0x00, 0x00, 0x10, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x18, 0x08, 0x00, 0x00, 0x00, 0x00
        /*0030*/        .byte   0x00, 0x00, 0x20, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x28, 0x08, 0x00, 0x00, 0x00, 0x00
        /*0040*/        .byte   0x00, 0x00, 0x30, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x38, 0x08, 0x00, 0x00, 0x00, 0x00
        /*0050*/        .byte   0x01, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x08, 0x08, 0x00, 0x00, 0x00, 0x00
        /*0060*/        .byte   0x01, 0x00, 0x10, 0x08, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x18, 0x08, 0x00, 0x00, 0x00, 0x00
        /*0070*/        .byte   0x01, 0x00, 0x20, 0x08, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x28, 0x08, 0x00, 0x00, 0x00, 0x00
        /*0080*/        .byte   0x01, 0x00, 0x30, 0x08, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x38, 0x08, 0x00, 0x00, 0x00, 0x00
        /*0090*/        .byte   0x02, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x08, 0x08, 0x00, 0x00, 0x00, 0x00
        /*00a0*/        .byte   0x02, 0x00, 0x10, 0x08, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x18, 0x08, 0x00, 0x00, 0x00, 0x00
        /*00b0*/        .byte   0x02, 0x00, 0x20, 0x08, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x28, 0x08, 0x00, 0x00, 0x00, 0x00
        /*00c0*/        .byte   0x02, 0x00, 0x30, 0x08, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x38, 0x08, 0x00, 0x00, 0x00, 0x00
        /*00d0*/        .byte   0x00, 0x00, 0x00, 0x14, 0x2c, 0x00, 0x00, 0x00

# EOD

Trying to load my kernel then obviously fails with CUDA_ERROR_NOT_FOUND. What am I missing here? I tried pointing nvlink at the kernel with --kernels-used, but that didn’t help.

Note that all of this works fine when using the CUDA driver APIs (i.e., using the linker APIs to link in cudadevrt, and cuModuleLoad to JIT to machine code). However, I’m considering switching to calling ptxas and nvlink directly, because it’s easier to upgrade the CUDA toolkit than the NVIDIA driver (the new static compiler library isn’t an option in my case), and I couldn’t find a way to do LTO through the driver APIs.
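For completeness, that working driver-side path is roughly the following. Again only a sketch: error handling is omitted, and the cudadevrt path is assumed to be the same one I pass to nvlink above.

#include <cuda.h>
#include <string.h>

// JIT a PTX string and link in the device runtime via the cuLink* APIs,
// then load the resulting image as a module.
CUmodule link_and_load(const char *ptx) {
    CUlinkState link;
    cuLinkCreate(0, NULL, NULL, &link);

    // Add the PTX source; the driver JITs it for the current device.
    cuLinkAddData(link, CU_JIT_INPUT_PTX, (void *)ptx, strlen(ptx) + 1,
                  "kernel.ptx", 0, NULL, NULL);

    // Link against the device runtime library.
    cuLinkAddFile(link, CU_JIT_INPUT_LIBRARY,
                  "/opt/cuda/lib64/libcudadevrt.a", 0, NULL, NULL);

    void *cubin; size_t cubin_size;
    cuLinkComplete(link, &cubin, &cubin_size);

    CUmodule mod;
    cuModuleLoadData(&mod, cubin);   // load before destroying the link state,
    cuLinkDestroy(link);             // which owns the cubin buffer
    return mod;
}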

Found it: it turns out nvlink is sensitive to the input file extension. Renaming the input object to .cubin gets me properly linked device code (I also had to switch from --library $path to -L $dir -lcudadevrt).