Keep PTX with CUDA_SEPARABLE_COMPILATION

Hello,
I am currently trying to fix a compatibility issue with the Blackwell series.
According to the Blackwell Compatibility Guide one should use CUDA_FORCE_PTX_JIT=1 to verify forward compatibility of binaries created with older versions of the CUDA Toolkit.

My binary fails this check. (Interestingly, the check fails on both Linux and Windows, even though the application itself runs fine on Linux.)

Now I am stuck trying to find out why the PTX was removed in the first place.
The project uses CMake and it seems that CUDA_SEPARABLE_COMPILATION ON is part of the problem.
I managed to reproduce the issue with a small sample.
When I switch from OFF to ON, the output of cuobjdump reduces from

Fatbin ptx code:
================
arch = sm_52
code version = [8,0]
host = linux
compile_size = 64bit
compressed

Fatbin elf code:
================
arch = sm_52
code version = [1,7]

host = linux
compile_size = 64bit

Fatbin elf code:
================
arch = sm_52
code version = [1,7]
host = linux
compile_size = 64bit

to just

Fatbin elf code:
================
arch = sm_52
code version = [1,7]
host = linux
compile_size = 64bit


The link command seems to be identical for both versions
dlink.txt:

nvcc -forward-unknown-to-host-compiler -ccbin=g++-11.3 -O3 -DNDEBUG "--generate-code=arch=compute_52,code=[compute_52,sm_52]" -Xcompiler=-fvisibility=hidden -Xcompiler=-fPIC -Wno-deprecated-gpu-targets -shared -dlink --options-file CMakeFiles/PtxIssue.dir/deviceObjects1.rsp -o CMakeFiles/PtxIssue.dir/cmake_device_link.o --options-file CMakeFiles/PtxIssue.dir/deviceLinkLibs.rsp

link.txt:

g++-11.3 @CMakeFiles/PtxIssue.dir/objects1.rsp CMakeFiles/PtxIssue.dir/cmake_device_link.o -o PtxIssue @CMakeFiles/PtxIssue.dir/linkLibs.rsp -L"<ToolkitInstallDirectory>/NVIDIA-CUDA-12.0-Toolkit/targets/x86_64-linux/lib/stubs" -L"<ToolkitInstallDirectory>/NVIDIA-CUDA-12.0-Toolkit/targets/x86_64-linux/lib"

I also attached the compile logs
CUDA_SEPARABLE_COMPILATION.ON.txt (3.8 KB)
CUDA_SEPARABLE_COMPILATION.OFF.txt (3.7 KB)


Sample Code

CMakeLists.txt

# Minimal reproducer: a single-TU CUDA executable built with separable
# compilation, used to show the embedded PTX disappearing from the fatbinary.
cmake_minimum_required(VERSION 3.23)
project(PtxIssue LANGUAGES C CXX CUDA)
# Pin the toolkit version so the repro matches the reported environment.
find_package(CUDAToolkit 12.0 EXACT REQUIRED)
add_executable(PtxIssue
        main.cu
)
set_target_properties(PtxIssue
        PROPERTIES
        # Toggling this OFF/ON is what changes the cuobjdump output shown above.
        CUDA_SEPARABLE_COMPILATION ON
        POSITION_INDEPENDENT_CODE ON
        # Forces the device-link step to run for this target.
        CUDA_RESOLVE_DEVICE_SYMBOLS ON
        CUDA_RUNTIME_LIBRARY static
)
target_compile_options(PtxIssue PRIVATE
        # -v / --keep: verbose compile log and keep intermediate files
        # (PTX, cubin) for inspection.
        $<$<COMPILE_LANGUAGE:CUDA>:-v>
        $<$<COMPILE_LANGUAGE:CUDA>:--keep>
)

main.cu

#include <iostream>
#include <stdexcept>
#include <string>

#include <cub/cub.cuh>
#include <cuda_runtime_api.h>

// Throws std::runtime_error if `result` is not cudaSuccess. The message is
// `errorMessage` followed by the numeric error code and the driver-provided
// error string, e.g. "Failed to allocate: (2) out of memory".
inline void
checkResult(cudaError_t result, std::string const& errorMessage)
{
    if (result == cudaSuccess) {
        return;
    }
    std::string details = errorMessage;
    details += ": (";
    details += std::to_string(result);
    details += ") ";
    details += cudaGetErrorString(result);
    throw std::runtime_error(details);
}

// Reusable GPU segmented sort of (float key, int value) pairs built on
// cub::DeviceSegmentedSort. All device scratch buffers are allocated once in
// the constructor, sized for at most `nValuesMax` pairs and `nSegmentsMax`
// segments, and reused across calls to operator().
class SegmentedSort {
private:
    int nValuesMax;   // capacity in key/value pairs
    int nSegmentsMax; // capacity in segments

    // Device allocations owned by this object.
    struct Internals {
        void*  storage       = nullptr; // cub temporary storage
        size_t storageSize   = 0;       // size of `storage` in bytes
        float* keySequence   = nullptr; // output buffer for sorted keys
        int*   valueSequence = nullptr; // output buffer for sorted values
    } internal;

    // Frees all device allocations. cudaFree errors are deliberately ignored
    // so this can run from the destructor (C++11 destructors are implicitly
    // noexcept — throwing there would call std::terminate) and from a
    // partially failed constructor.
    void
    release() noexcept
    {
        if (internal.keySequence) {
            (void)cudaFree(internal.keySequence);
            internal.keySequence = nullptr;
        }
        if (internal.valueSequence) {
            (void)cudaFree(internal.valueSequence);
            internal.valueSequence = nullptr;
        }
        if (internal.storage) {
            (void)cudaFree(internal.storage);
            internal.storage = nullptr;
        }
    }

public:
    // Allocates the key/value double buffers and queries cub for the required
    // temporary storage size before allocating it.
    // Throws std::runtime_error on any CUDA/cub failure; already-made
    // allocations are released before rethrowing, because the destructor does
    // not run for a partially constructed object.
    SegmentedSort(int nValuesMax, int nSegmentsMax) : nValuesMax(nValuesMax), nSegmentsMax(nSegmentsMax)
    {
        try {
            checkResult(cudaMalloc(&internal.keySequence, nValuesMax * sizeof(float)),
                "Failed to allocate memory for keySequenceInternal");
            checkResult(cudaMalloc(&internal.valueSequence, nValuesMax * sizeof(int)),
                "Failed to allocate memory for valueSequenceInternal");
            // First SortPairs call with null storage only computes storageSize.
            checkResult(cub::DeviceSegmentedSort::SortPairs<float, int, int const*, int const*>(nullptr,
                            internal.storageSize, nullptr, nullptr, nullptr, nullptr, nValuesMax, nSegmentsMax, nullptr,
                            nullptr),
                "Failed to determine temporary storage for segmented sort");
            checkResult(cudaMalloc(&internal.storage, internal.storageSize),
                "Failed to allocate memory for temporary storage");
        } catch (...) {
            release();
            throw;
        }
    }

    // This class owns raw device pointers; copying would double-free them.
    SegmentedSort(SegmentedSort const&) = delete;
    SegmentedSort& operator=(SegmentedSort const&) = delete;

    ~SegmentedSort()
    {
        release();
    }

    // Sorts `count` key/value pairs in place within `segmentCount` segments
    // delimited by `segmentOffsets` (segmentCount + 1 device-side offsets).
    // All pointer arguments are device pointers.
    // Throws std::runtime_error if the capacities given at construction are
    // exceeded or any CUDA/cub call fails.
    void
    operator()(float* keys, int* values, int count, int const* segmentOffsets, int segmentCount)
    {
        if (count > nValuesMax) {
            throw std::runtime_error("Number of key/values is too large");
        }
        if (segmentCount > nSegmentsMax) {
            throw std::runtime_error("Number of segments is too large");
        }

        checkResult(cub::DeviceSegmentedSort::SortPairs(internal.storage, internal.storageSize, keys,
                        internal.keySequence, values, internal.valueSequence, count, segmentCount, segmentOffsets,
                        segmentOffsets + 1),
            "Failed to perform segmented sort");

        // cub writes into the internal buffers; copy the results back so the
        // caller's arrays are sorted in place.
        checkResult(cudaMemcpy(keys, internal.keySequence, count * sizeof(float), cudaMemcpyDeviceToDevice),
            "Failed to copy sorted keys back to original array");
        checkResult(cudaMemcpy(values, internal.valueSequence, count * sizeof(int), cudaMemcpyDeviceToDevice),
            "Failed to copy sorted values back to original array");
    }
};

// Demo driver: sorts 10000 uninitialized key/value pairs split into two
// segments of 5000. Returns 0 on success and 1 on failure — the original
// function-try-block fell off the end of its handler, so the process exited
// with status 0 even when an error was reported.
int main()
{
    try {
        SegmentedSort sorter(1000000, 1000);

        float* keys           = nullptr;
        int*   values         = nullptr;
        int*   segmentOffsets = nullptr;
        int    count          = 10000;
        int    segmentCount   = 2;

        // Allocate memory for keys, values & segment offsets
        checkResult(cudaMalloc(&keys, count * sizeof(float)), "Failed to allocate keys");
        checkResult(cudaMalloc(&values, count * sizeof(int)), "Failed to allocate values");
        checkResult(cudaMalloc(&segmentOffsets, (segmentCount + 1) * sizeof(int)), "Failed to allocate segmentOffsets");

        // segmentCount + 1 offsets delimit the segments: [0,5000) and [5000,10000)
        int offsets[] = {0, 5000, 10000};
        checkResult(cudaMemcpy(segmentOffsets, offsets, (segmentCount + 1) * sizeof(int), cudaMemcpyHostToDevice),
            "Failed to copy segment offsets");

        // Perform sorting
        sorter(keys, values, count, segmentOffsets, segmentCount);

        // Free allocated memory
        checkResult(cudaFree(keys), "Failed to free keys");
        checkResult(cudaFree(values), "Failed to free values");
        checkResult(cudaFree(segmentOffsets), "Failed to free segment offsets");

        std::cout << "Success" << std::endl;
        return 0;
    } catch (std::exception const& e) {
        std::cerr << "Error: " << e.what() << std::endl;
        return 1;
    }
}

Many thanks in advance and kind regards
Jonas

In my own project, the device link step appears to remove the embedded PTX code. It was verified with cuobjdump that the testobject.o still contained embedded PTX.


Fatbin elf code:
================
arch = sm_70
code version = [1,7]
host = linux
compile_size = 64bit
compressed

Fatbin ptx code:
================
arch = sm_70
code version = [8,2]
host = linux
compile_size = 64bit
compressed
ptxasOptions = --compile-only
nvcc -arch=sm_70 -rdc=true -dlink testobject.o -o dlink_o.o

The dlink_o.o module then reports this:

Fatbin elf code:
================
arch = sm_70
code version = [1,7]
host = linux
compile_size = 64bit
compressed

The same happens with -gencode arch=compute_70,code=sm_70

Thank you for further investigating this.

Does this mean that relocatable device code and forward compatibility through embedded PTX are mutually exclusive?

They are not mutually exclusive. You can prove this to yourself with a simple test.

One of the aspects of separable compilation that you are exploring here is device-linking of code. Device-linking of code takes place during the conversion from PTX to SASS. Therefore, if you specify device-linking, you are necessarily specifying some form of SASS output. That doesn’t necessarily mean that the PTX disappears from your fatbinary, but it does seem to mean that the PTX after that step is no longer in a state that is readily dump-able by cuobjdump. However by specifying cuobjdump -all (on the fatbinary, not on an intermediate linked object) it is evident that there are additional sections in the fatbinary that we don’t have full visibility into.

The most likely conclusion, then, is that the device-linking step must be performed “again” by the GPU driver, when PTX code for a previous architecture is run on a newer architecture. This facility in the GPU driver is sometimes generically referred to as CUDA JIT Link.

I won’t be able to help much with CMake.

Here is a simple test indicating PTX forward compatibility is not incompatible with separable compilation. It’s being run on a cc8.9 device (CUDA 12.2):

# cat t1.cu
__device__ int foo(int d) {return (d * 2);}
# cat t2.cu
#include <cstdio>
__device__ int foo(int);
__global__ void k(int c){printf("%d\n", foo(c));}


int main(){
  k<<<1,1>>>(3);
  cudaDeviceSynchronize();
}

# nvcc -rdc=true -o test t1.cu t2.cu -arch=sm_70
# compute-sanitizer ./test
========= COMPUTE-SANITIZER
6
========= ERROR SUMMARY: 0 errors
# cuobjdump -all ./test

Fatbin elf code:
================
arch = sm_70
code version = [1,7]
host = linux
compile_size = 64bit

Fatbin elf code:
================
arch = sm_70
code version = [1,7]
host = linux
compile_size = 64bit
compressed

Fatbin ptx code:
================
arch = sm_70
code version = [8,2]
host = linux
compile_size = 64bit
compressed
ptxasOptions = --compile-only

Fatbin elf code:
================
arch = sm_70
code version = [1,7]
host = linux
compile_size = 64bit
compressed

Fatbin ptx code:
================
arch = sm_70
code version = [8,2]
host = linux
compile_size = 64bit
compressed
ptxasOptions = --compile-only
#

(CUDA_FORCE_PTX_JIT=1 ./test also works as expected.)

FWIW, when I compile and run your posted main.cu code as my t3.cu in a similar fashion, on the same setup, I observe some differences:

# nvcc -o t3 t3.cu -rdc=true -arch=sm_70
# compute-sanitizer ./t3
========= COMPUTE-SANITIZER
Success
========= ERROR SUMMARY: 0 errors
# CUDA_FORCE_PTX_JIT=1 ./t3
Error: Failed to determine temporary storage for segmented sort: (300) device kernel image is invalid
#

I have not run into that before and don’t have an immediate explanation, except to say it may be related to cub usage, since that particular issue does not show up in my simple test case. You could probably file a bug for:

  1. cuobjdump not showing PTX when relocatable device code specified (perhaps/arguably an RFE since the current behavior is probably intended/expected, I would guess) see later note below
  2. your test case failing with FORCE PTX JIT

Later: Some of my colleagues informed me that specifying cuobjdump -all -ptx will cause the PTX to be dumped even when specifying relocatable device code/device-linking. It worked for my test case.

Bringing back the update from NVBUG ticket ID 5520883:

Here is the explanation of the behavior.
In the case described in this bug (t3.cu), when -rdc=true (CUDA_SEPARABLE_COMPILATION ON) is specified, CUB by default enables CUDA CDP (CUDA Dynamic Parallelism), which relies on and links against the CUDA library libcudadevrt.a. In most of our libraries, we ship PTX for the highest architecture, which can be seen by applying cuobjdump to the libraries. This PTX ensures forward compatibility of the libraries with future newer architectures via driver JIT.
When CUDA_FORCE_PTX_JIT=1 is specified, all embedded cubins in the libraries are disabled and everything is forced to compile from PTX. The embedded sm_120 PTX (the latest architecture as of CUDA 13.0) fails to JIT for your compute capability 7.0 device. An app built with any CUDA toolkit version released since sm_70 support will fail this check.
When we disable CDP in CUB, we can see it passes on an sm_70 machine:

$ nvcc -o test test.cu -rdc=true -arch=sm_70 -DCUB_DISABLE_CDP=1
$ CUDA_FORCE_PTX_JIT=1 ./test
Success
$ 

Is this expected behavior, and what is the confusion?
Yes, this is expected behavior. The confusion here is that CUDA_FORCE_PTX_JIT=1 is intended for checking an app’s forward compatibility on newer GPU architectures, not on the same GPU machine or merely with a newer driver/CUDA toolkit. For the main.cu case initially described in the forum, let’s go through it step by step.

  1. Suppose the app main.cu is built against CUDA 12.0, with sm_52 PTX, on a Turing GPU machine, the same setup as described in the forum post.
    This shows the same failure you observed:
yni@node2:~/yni/Customerbug/5520883$ /usr/local/cuda-12.0/bin/nvcc --generate-code=arch=compute_52,code=[compute_52,sm_52] -rdc=true main.cu -ccbin=g++-11
yni@node2:~/yni/Customerbug/5520883$ ./a.out 
Success
yni@node2:~/yni/Customerbug/5520883$ CUDA_FORCE_PTX_JIT=1 ./a.out 
Error: Failed to determine temporary storage for segmented sort: (300) device kernel image is invalid
yni@node2:~/yni/Customerbug/5520883$ 

This is because there is sm_90 PTX, which cannot JIT to compute_75 (Turing) code.

yni@node2:~/yni/Customerbug/5520883$ /usr/local/cuda-12.0/bin/cuobjdump -all ./a.out | grep ptx -A 2
Fatbin ptx code:
================
arch = sm_52
--
ptxasOptions = --compile-only  

Fatbin elf code:
--
Fatbin ptx code:
================
arch = sm_90
--
ptxasOptions = --compile-only  -maxrregcount=32 

Fatbin elf code:
yni@node2:~/yni/Customerbug/5520883$ 
yni@node2:~/yni/Customerbug/5520883$ /usr/local/cuda-12.0/bin/__nvcc_device_query 
75
yni@node2:~/yni/Customerbug/5520883$ 
  2. Now we have a new Blackwell GPU and would like to check whether our built app can run on this new GPU.
    Copy the binary over directly. We can see the app passes this check successfully because all embedded PTX (sm_52 & sm_90) can JIT to the local compute capability 12.0 code.
local-yni@1u1g-b650-0080:~/yni/5520883$ ./a.out 
Success
local-yni@1u1g-b650-0080:~/yni/5520883$ CUDA_FORCE_PTX_JIT=1 ./a.out 
Success
local-yni@1u1g-b650-0080:~/yni/5520883$ /usr/local/cuda-13.0/bin/__nvcc_device_query 
120
local-yni@1u1g-b650-0080:~/yni/5520883$ 

Hope this explains the behavior to you. That said, your initial case should pass the CUDA_FORCE_PTX_JIT=1 test on a Blackwell GPU, provided your app is built against CUDA 12.0 and all kernels contain embedded PTX.