Hello,
I am currently trying to fix an compatibility issue with the Blackwell series.
According to the Blackwell Compatibility Guide one should use CUDA_FORCE_PTX_JIT=1 to verify forward compatibility of binaries created with older versions of the CUDA Toolkit.
My binary fails this check. (Interestingly, the check fails on Linux and Windows, but the application works fine on Linux.)
Now I am stuck trying to find out why the PTX was removed in the first place.
The project uses CMake and it seems that CUDA_SEPARABLE_COMPILATION ON is part of the problem.
I managed to reproduce the issue with a small sample.
When I switch from OFF to ONthe output of cuobjdump reduces from
Fatbin ptx code:
================
arch = sm_52
code version = [8,0]
host = linux
compile_size = 64bit
compressed
Fatbin elf code:
================
arch = sm_52
code version = [1,7]
host = linux
compile_size = 64bit
Fatbin elf code:
================
arch = sm_52
code version = [1,7]
host = linux
compile_size = 64bit
to just
Fatbin elf code:
================
arch = sm_52
code version = [1,7]
host = linux
compile_size = 64bit
The link command seems to be identical for both versions
dlink.txt:
nvcc -forward-unknown-to-host-compiler -ccbin=g++-11.3 -O3 -DNDEBUG "--generate-code=arch=compute_52,code=[compute_52,sm_52]" -Xcompiler=-fvisibility=hidden -Xcompiler=-fPIC -Wno-deprecated-gpu-targets -shared -dlink --options-file CMakeFiles/PtxIssue.dir/deviceObjects1.rsp -o CMakeFiles/PtxIssue.dir/cmake_device_link.o --options-file CMakeFiles/PtxIssue.dir/deviceLinkLibs.rsp
link.txt:
g++-11.3 @CMakeFiles/PtxIssue.dir/objects1.rsp CMakeFiles/PtxIssue.dir/cmake_device_link.o -o PtxIssue @CMakeFiles/PtxIssue.dir/linkLibs.rsp -L"<ToolkitInstallDirectory>/NVIDIA-CUDA-12.0-Toolkit/targets/x86_64-linux/lib/stubs" -L"<ToolkitInstallDirectory>/NVIDIA-CUDA-12.0-Toolkit/targets/x86_64-linux/lib"
I also attached the compile logs
CUDA_SEPARABLE_COMPILATION.ON.txt (3.8 KB)
CUDA_SEPARABLE_COMPILATION.OFF.txt (3.7 KB)
Sample Code
CMakeLists.txt
cmake_minimum_required(VERSION 3.23)
project(PtxIssue LANGUAGES C CXX CUDA)
find_package(CUDAToolkit 12.0 EXACT REQUIRED)
add_executable(PtxIssue
main.cu
)
set_target_properties(PtxIssue
PROPERTIES
CUDA_SEPARABLE_COMPILATION ON
POSITION_INDEPENDENT_CODE ON
CUDA_RESOLVE_DEVICE_SYMBOLS ON
CUDA_RUNTIME_LIBRARY static
)
target_compile_options(PtxIssue PRIVATE
$<$<COMPILE_LANGUAGE:CUDA>:-v>
$<$<COMPILE_LANGUAGE:CUDA>:--keep>
)
main.cu
#include <iostream>
#include <stdexcept>
#include <string>
#include <cub/cub.cuh>
#include <cuda_runtime_api.h>
inline void
checkResult(cudaError_t result, std::string const& errorMessage)
{
if (result != cudaSuccess) {
throw std::runtime_error(errorMessage + ": (" + std::to_string(result) + ") " + cudaGetErrorString(result));
}
}
class SegmentedSort {
private:
int nValuesMax;
int nSegmentsMax;
struct Internals {
void* storage = nullptr;
size_t storageSize = 0;
float* keySequence = nullptr;
int* valueSequence = nullptr;
} internal;
public:
SegmentedSort(int nValuesMax, int nSegmentsMax) : nValuesMax(nValuesMax), nSegmentsMax(nSegmentsMax)
{
checkResult(cudaMalloc(&internal.keySequence, nValuesMax * sizeof(float)),
"Failed to allocate memory for keySequenceInternal");
checkResult(cudaMalloc(&internal.valueSequence, nValuesMax * sizeof(int)),
"Failed to allocate memory for valueSequenceInternal");
checkResult(cub::DeviceSegmentedSort::SortPairs<float, int, int const*, int const*>(nullptr,
internal.storageSize, nullptr, nullptr, nullptr, nullptr, nValuesMax, nSegmentsMax, nullptr,
nullptr),
"Failed to determine temporary storage for segmented sort");
checkResult(cudaMalloc(&internal.storage, internal.storageSize),
"Failed to allocate memory for temporary storage");
}
~SegmentedSort()
{
if (internal.keySequence) {
checkResult(cudaFree(internal.keySequence), "Failed to free key sequence");
}
if (internal.valueSequence) {
checkResult(cudaFree(internal.valueSequence), "Failed to free value sequence");
}
if (internal.storage) {
checkResult(cudaFree(internal.storage), "Failed to free storage");
}
}
void
operator()(float* keys, int* values, int count, int const* segmentOffsets, int segmentCount)
{
if (count > nValuesMax) {
throw std::runtime_error("Number of key/values is too large");
}
if (segmentCount > nSegmentsMax) {
throw std::runtime_error("Number of segments is too large");
}
checkResult(cub::DeviceSegmentedSort::SortPairs(internal.storage, internal.storageSize, keys,
internal.keySequence, values, internal.valueSequence, count, segmentCount, segmentOffsets,
segmentOffsets + 1),
"Failed to perform segmented sort");
checkResult(cudaMemcpy(keys, internal.keySequence, count * sizeof(float), cudaMemcpyDeviceToDevice),
"Failed to copy sorted keys back to original array");
checkResult(cudaMemcpy(values, internal.valueSequence, count * sizeof(int), cudaMemcpyDeviceToDevice),
"Failed to copy sorted values back to original array");
}
};
int main()
try {
SegmentedSort sorter(1000000, 1000);
float* keys = nullptr;
int* values = nullptr;
int* segmentOffsets = nullptr;
int count = 10000;
int segmentCount = 2;
// Allocate memory for keys, values & segment offsets
checkResult(cudaMalloc(&keys, count * sizeof(float)), "Failed to allocate keys");
checkResult(cudaMalloc(&values, count * sizeof(int)), "Failed to allocate values");
checkResult(cudaMalloc(&segmentOffsets, (segmentCount + 1) * sizeof(int)), "Failed to allocate segmentOffsets");
int offsets[] = {0, 5000, 10000};
checkResult(cudaMemcpy(segmentOffsets, offsets, (segmentCount + 1) * sizeof(int), cudaMemcpyHostToDevice),
"Failed to copy segment offsets");
// Perform sorting
sorter(keys, values, count, segmentOffsets, segmentCount);
// Free allocated memory
checkResult(cudaFree(keys), "Failed to free keys");
checkResult(cudaFree(values), "Failed to free values");
checkResult(cudaFree(segmentOffsets), "Failed to free segment offsets");
std::cout << "Success" << std::endl;
} catch (std::exception const& e) {
std::cerr << "Error: " << e.what() << std::endl;
}
Many thanks in advance and kind regards
Jonas