CudaAPI calls in functions, compiler/linking bug?

I am currently working on a wrapper around Cuda Texture Object and running into several problems I do not understand.

In the example below if you compile the code with if constexpr (true) set you get the error with some random number

cudaMemcpy2DToArray failed with unrecognized error code 2113869480

When set to false, everything runs fine.
Changing all variables in the function to static or global does not solve the problem.

If I change execute_lerp(table.size()); to execute_lerp(22); a hard-coded number, everything works again in booth options.

What’s going on?

I compile with “CMAKE_POSITION_INDEPENDENT_CODE” enabled.

#include <vector>
#include <iostream>
#include <stdexcept>

#define USE_FUNCTION

__global__ void lerp_test(cudaTextureObject_t tex, const unsigned int N)
{
    int idx = threadIdx.x + blockDim.x * blockIdx.x;
    if (idx >= N)
        return;

    const float tex_coords = (float)idx + 0.5;

    printf("tex_coords: %f\n", tex_coords);
    printf("Tex: %f\n", tex1D<float>(tex, tex_coords));
}

__host__ cudaArray *createArray(const size_t size)
{
    cudaError_t last_error_;

    cudaArray *dArray_;

    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
    last_error_ = cudaMallocArray(&dArray_, &channelDesc, size, 1, cudaArrayDefault);
    if (last_error_ != cudaSuccess)
        throw std::runtime_error("cudaMallocArray failed with " + std::string(cudaGetErrorString(last_error_)));

    return dArray_;
}

__host__ std::vector<double> execute_lerp(size_t N)
{ 
    float src[N];
    for (int i = 0; i < N; i++)
    {
        src[i] = i;
    }

    cudaError_t last_error_;

    cudaArray *dArray_; // = createArray(N);

  #ifdef USE_FUNCTION
        std::cout << "Call Function" << std::endl;
        dArray_ = createArray(sizeof(float) * N);

#else
        std::cout << "Build inplace" << std::endl;
        cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
        last_error_ = cudaMallocArray(&dArray_, &channelDesc, sizeof(float) * N, 1, cudaArrayDefault);
        if (last_error_ != cudaSuccess)
            throw std::runtime_error("cudaMallocArray failed with " + std::string(cudaGetErrorString(last_error_)));

#endif

    cudaMemcpy2DToArray(dArray_, 0, 0, src, N * sizeof(float), N * sizeof(float), 1, cudaMemcpyHostToDevice);
    if (last_error_ != cudaSuccess)
        throw std::runtime_error("cudaMemcpy2DToArray failed with " + std::string(cudaGetErrorString(last_error_)) + " " + std::to_string(last_error_));

    cudaResourceDesc resDesc;
    memset(&resDesc, 0, sizeof(resDesc));
    resDesc.resType = cudaResourceTypeArray;
    resDesc.res.array.array = dArray_;

    cudaTextureDesc texDesc;
    memset(&texDesc, 0, sizeof(texDesc));
    texDesc.readMode = cudaReadModeElementType; // Read data as provided type, no casting
    texDesc.filterMode = cudaFilterModeLinear;
    texDesc.addressMode[0] = cudaAddressModeWrap;
    texDesc.addressMode[1] = cudaAddressModeWrap;
    texDesc.normalizedCoords = 0;

    cudaTextureObject_t textureObject_;

    // create texture object: we only have to do this once!
    last_error_ = cudaCreateTextureObject(&textureObject_, &resDesc, &texDesc, NULL);
    if (last_error_ != cudaSuccess)
        throw std::runtime_error("cudaCreateTextureObject failed with " + std::string(cudaGetErrorString(last_error_)));

    lerp_test<<<(N + 255) / 256, 256>>>(textureObject_, 22);

    auto error = cudaDeviceSynchronize();
    if (error != cudaSuccess)
    {
        std::cout << "Kernel failed: " << cudaGetErrorString(error) << std::endl;
        throw std::runtime_error("Kernel failed");
    }
    return;
}


int main()
{
    std::cout << "Running main" << std::endl;

    std::vector<float> table(22);
    for (int i = 0; i < 22; i++)
    {
        table[i] = i;
    }

    execute_lerp(table.size());

    return 0;
}

I am not allowed to comment on stack overflow (Edit now it works…)

nvcc -forward-unknown-to-host-compiler -rdc=true -g --generate-code=arch=compute_80,code=[compute_80,sm_80] -Xcompiler=-fPIE -std=c++14 -MD -x cu -dc ./check.cu -o ./check.cu.o

nvcc -arch=sm_80 -dlink -o ./check_link.o ./check.cu.o -lcudadevrt -lcudart


g++ -std=c++17 -g -Og ./check_link.o ./check.cu.o -L/usr/local/cuda/targets/x86_64-linux/lib -lcudadevrt -lcudart_static -lrt -lpthread -ldl

I cleared up the example and replaced if constexpr with preprocessor if.

If you define USE_FUNCTION the compiled code does not work, if you undefine USE_FUNCTION the code does work

I don’t seem to have any difficulty with it. I did make a few changes to your code to remove compiler warnings, but I don’t think they are relevant. Here is my test case:

# cat check.cu
#include <vector>
#include <iostream>
#include <stdexcept>

#define USE_FUNCTION

__global__ void lerp_test(cudaTextureObject_t tex, const unsigned int N)
{
    int idx = threadIdx.x + blockDim.x * blockIdx.x;
    if (idx >= N)
        return;

    const float tex_coords = (float)idx + 0.5;

    printf("tex_coords: %f\n", tex_coords);
    printf("Tex: %f\n", tex1D<float>(tex, tex_coords));
}

__host__ cudaArray *createArray(const size_t size)
{
    cudaError_t last_error_;

    cudaArray *dArray_;

    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
    last_error_ = cudaMallocArray(&dArray_, &channelDesc, size, 1, cudaArrayDefault);
    if (last_error_ != cudaSuccess)
        throw std::runtime_error("cudaMallocArray failed with " + std::string(cudaGetErrorString(last_error_)));

    return dArray_;
}

__host__ void execute_lerp(size_t N)
{
    float src[N];
    for (int i = 0; i < N; i++)
    {
        src[i] = i;
    }

    cudaError_t last_error_ = cudaSuccess;

    cudaArray *dArray_; // = createArray(N);

  #ifdef USE_FUNCTION
        std::cout << "Call Function" << std::endl;
        dArray_ = createArray(sizeof(float) * N);

#else
        std::cout << "Build inplace" << std::endl;
        cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
        last_error_ = cudaMallocArray(&dArray_, &channelDesc, sizeof(float) * N, 1, cudaArrayDefault);
        if (last_error_ != cudaSuccess)
            throw std::runtime_error("cudaMallocArray failed with " + std::string(cudaGetErrorString(last_error_)));

#endif

    cudaMemcpy2DToArray(dArray_, 0, 0, src, N * sizeof(float), N * sizeof(float), 1, cudaMemcpyHostToDevice);
    if (last_error_ != cudaSuccess)
        throw std::runtime_error("cudaMemcpy2DToArray failed with " + std::string(cudaGetErrorString(last_error_)) + " " + std::to_string(last_error_));

    cudaResourceDesc resDesc;
    memset(&resDesc, 0, sizeof(resDesc));
    resDesc.resType = cudaResourceTypeArray;
    resDesc.res.array.array = dArray_;

    cudaTextureDesc texDesc;
    memset(&texDesc, 0, sizeof(texDesc));
    texDesc.readMode = cudaReadModeElementType; // Read data as provided type, no casting
    texDesc.filterMode = cudaFilterModeLinear;
    texDesc.addressMode[0] = cudaAddressModeWrap;
    texDesc.addressMode[1] = cudaAddressModeWrap;
    texDesc.normalizedCoords = 0;

    cudaTextureObject_t textureObject_;

    // create texture object: we only have to do this once!
    last_error_ = cudaCreateTextureObject(&textureObject_, &resDesc, &texDesc, NULL);
    if (last_error_ != cudaSuccess)
        throw std::runtime_error("cudaCreateTextureObject failed with " + std::string(cudaGetErrorString(last_error_)));

    lerp_test<<<(N + 255) / 256, 256>>>(textureObject_, 22);

    auto error = cudaDeviceSynchronize();
    if (error != cudaSuccess)
    {
        std::cout << "Kernel failed: " << cudaGetErrorString(error) << std::endl;
        throw std::runtime_error("Kernel failed");
    }
    return;
}


int main()
{
    std::cout << "Running main" << std::endl;

    std::vector<float> table(22);
    for (int i = 0; i < 22; i++)
    {
        table[i] = i;
    }

    execute_lerp(table.size());

    return 0;
}
# nvcc -forward-unknown-to-host-compiler -rdc=true -g --generate-code=arch=compute_80,code=[compute_80,sm_80] -Xcompiler=-fPIE -std=c++14 -MD -x cu -dc ./check.cu -o ./check.cu.o
# nvcc -arch=sm_80 -dlink -o ./check_link.o ./check.cu.o -lcudadevrt -lcudart
# g++ -std=c++17 -g -Og ./check_link.o ./check.cu.o -L/usr/local/cuda/targets/x86_64-linux/lib -lcudadevrt -lcudart_static -lrt -lpthread -ldl
# ./a.out
Running main
Call Function
tex_coords: 0.500000
tex_coords: 1.500000
tex_coords: 2.500000
tex_coords: 3.500000
tex_coords: 4.500000
tex_coords: 5.500000
tex_coords: 6.500000
tex_coords: 7.500000
tex_coords: 8.500000
tex_coords: 9.500000
tex_coords: 10.500000
tex_coords: 11.500000
tex_coords: 12.500000
tex_coords: 13.500000
tex_coords: 14.500000
tex_coords: 15.500000
tex_coords: 16.500000
tex_coords: 17.500000
tex_coords: 18.500000
tex_coords: 19.500000
tex_coords: 20.500000
tex_coords: 21.500000
Tex: 0.000000
Tex: 1.000000
Tex: 2.000000
Tex: 3.000000
Tex: 4.000000
Tex: 5.000000
Tex: 6.000000
Tex: 7.000000
Tex: 8.000000
Tex: 9.000000
Tex: 10.000000
Tex: 11.000000
Tex: 12.000000
Tex: 13.000000
Tex: 14.000000
Tex: 15.000000
Tex: 16.000000
Tex: 17.000000
Tex: 18.000000
Tex: 19.000000
Tex: 20.000000
Tex: 21.000000
#

CUDA 12.2.1, L4

I am running on a DGX A100 Node inside a NGC Cuda container.

Info:

NVIDIA-SMI 470.161.03 Driver Version: 470.161.03 CUDA Version: 12.2

I will try to test on an up-to-date driver version, maybe this can cause the issue.

After upgrading to a node with newer software version installed, it got better.

There still seems to be a problem with linking to catch2::withMain that creates some sort of intermediate issues. In this case: "illegal memory access was encountered” when accessing the texture. It seems it is not correctly bound to the kernel.

If I remove TEST_CASE and replace it with main() everything works as expected.

The same issue exists if I pyind functions utilizing cuda to pybind11 and call them via python.
The kernel returns, with illegal memory access was encountered.

It is likely some sort of CMake issue I am currently running into.

I created an example repository that reproduces the error