cudnnCreate blocks indefinitely on my AGX Orin with JetPack 5.1.2

Hi,
I ran the simple cuDNN test code below (copied from "Minimal cuDNN C++ Example. A minimal 70-line cuDNN example that… | by Rohit Dwivedula | Medium"):

#include <iostream>
#include <cuda_runtime.h>
#include <cudnn.h>

/**
 * Minimal example to apply sigmoid activation on a tensor 
 * using cuDNN.
 **/
int main(int argc, char** argv)
{    
    int numGPUs;
    cudaGetDeviceCount(&numGPUs);
    std::cout << "Found " << numGPUs << " GPUs." << std::endl;
    cudaSetDevice(0); // use GPU0
    int device; 
    struct cudaDeviceProp devProp;
    cudaGetDevice(&device);
    cudaGetDeviceProperties(&devProp, device);
    std::cout << "Compute capability:" << devProp.major << "." << devProp.minor << std::endl;

    cudnnHandle_t handle_;
    cudnnCreate(&handle_);
    std::cout << "Created cuDNN handle" << std::endl;

    // create the tensor descriptor
    cudnnDataType_t dtype = CUDNN_DATA_FLOAT;
    cudnnTensorFormat_t format = CUDNN_TENSOR_NCHW;
    int n = 1, c = 1, h = 1, w = 10;
    int NUM_ELEMENTS = n*c*h*w;
    cudnnTensorDescriptor_t x_desc;
    cudnnCreateTensorDescriptor(&x_desc);
    cudnnSetTensor4dDescriptor(x_desc, format, dtype, n, c, h, w);

    // create the tensor
    float *x;
    cudaMallocManaged(&x, NUM_ELEMENTS * sizeof(float));
    for(int i=0;i<NUM_ELEMENTS;i++) x[i] = i * 1.00f;
    std::cout << "Original array: "; 
    for(int i=0;i<NUM_ELEMENTS;i++) std::cout << x[i] << " ";

    // create activation function descriptor
    float alpha[1] = {1};
    float beta[1] = {0.0};
    cudnnActivationDescriptor_t sigmoid_activation;
    cudnnActivationMode_t mode = CUDNN_ACTIVATION_SIGMOID;
    cudnnNanPropagation_t prop = CUDNN_NOT_PROPAGATE_NAN;
    cudnnCreateActivationDescriptor(&sigmoid_activation);
    cudnnSetActivationDescriptor(sigmoid_activation, mode, prop, 0.0f);

    cudnnActivationForward(
        handle_,
        sigmoid_activation,
        alpha,
        x_desc,
        x,
        beta,
        x_desc,
        x
    );

    cudnnDestroy(handle_);
    std::cout << std::endl << "Destroyed cuDNN handle." << std::endl;
    std::cout << "New array: ";
    for(int i=0;i<NUM_ELEMENTS;i++) std::cout << x[i] << " ";
    std::cout << std::endl;
    cudaFree(x);
    return 0;
}

But it blocks indefinitely in cudnnCreate, and gdb prints the following stack trace:

0x0000ffffc5e1fb78 in ?? () from /usr/lib/aarch64-linux-gnu/tegra/libnvidia-ptxjitcompiler.so.1
(gdb) bt
#0  0x0000ffffc5e1fb78 in ?? () from /usr/lib/aarch64-linux-gnu/tegra/libnvidia-ptxjitcompiler.so.1
#1  0x0000ffffc5e6b6fc in ?? () from /usr/lib/aarch64-linux-gnu/tegra/libnvidia-ptxjitcompiler.so.1
#2  0x0000ffffc5e73588 in ?? () from /usr/lib/aarch64-linux-gnu/tegra/libnvidia-ptxjitcompiler.so.1
#3  0x0000ffffc5e737e4 in ?? () from /usr/lib/aarch64-linux-gnu/tegra/libnvidia-ptxjitcompiler.so.1
#4  0x0000ffffc5d925c4 in ?? () from /usr/lib/aarch64-linux-gnu/tegra/libnvidia-ptxjitcompiler.so.1
#5  0x0000ffffc5d92730 in ?? () from /usr/lib/aarch64-linux-gnu/tegra/libnvidia-ptxjitcompiler.so.1
#6  0x0000ffffc5d6f730 in ?? () from /usr/lib/aarch64-linux-gnu/tegra/libnvidia-ptxjitcompiler.so.1
#7  0x0000ffffc5d70024 in ?? () from /usr/lib/aarch64-linux-gnu/tegra/libnvidia-ptxjitcompiler.so.1
#8  0x0000ffffc5f184d8 in ?? () from /usr/lib/aarch64-linux-gnu/tegra/libnvidia-ptxjitcompiler.so.1
#9  0x0000ffffc5f18544 in ?? () from /usr/lib/aarch64-linux-gnu/tegra/libnvidia-ptxjitcompiler.so.1
#10 0x0000ffffc5c49554 in ?? () from /usr/lib/aarch64-linux-gnu/tegra/libnvidia-ptxjitcompiler.so.1
#11 0x0000ffffc5c51134 in ?? () from /usr/lib/aarch64-linux-gnu/tegra/libnvidia-ptxjitcompiler.so.1
#12 0x0000ffffc5c54ab0 in ?? () from /usr/lib/aarch64-linux-gnu/tegra/libnvidia-ptxjitcompiler.so.1
#13 0x0000ffffc5c56080 in ?? () from /usr/lib/aarch64-linux-gnu/tegra/libnvidia-ptxjitcompiler.so.1
#14 0x0000ffffc5c4a840 in __cuda_CallJitEntryPoint () from /usr/lib/aarch64-linux-gnu/tegra/libnvidia-ptxjitcompiler.so.1
#15 0x0000fffff67145f0 in ?? () from /usr/lib/aarch64-linux-gnu/tegra/libcuda.so.1
#16 0x0000fffff6746ccc in ?? () from /usr/lib/aarch64-linux-gnu/tegra/libcuda.so.1
#17 0x0000fffff65bc6b0 in ?? () from /usr/lib/aarch64-linux-gnu/tegra/libcuda.so.1
#18 0x0000fffff654870c in ?? () from /usr/lib/aarch64-linux-gnu/tegra/libcuda.so.1
#19 0x0000fffff6548d5c in ?? () from /usr/lib/aarch64-linux-gnu/tegra/libcuda.so.1
#20 0x0000ffffc87e2e0c in ?? () from /lib/aarch64-linux-gnu/libcudnn_ops_infer.so.8
#21 0x0000ffffc87d3600 in ?? () from /lib/aarch64-linux-gnu/libcudnn_ops_infer.so.8
#22 0x0000ffffc87e9484 in ?? () from /lib/aarch64-linux-gnu/libcudnn_ops_infer.so.8
#23 0x0000ffffc87eaa7c in ?? () from /lib/aarch64-linux-gnu/libcudnn_ops_infer.so.8
#24 0x0000ffffc87eafdc in ?? () from /lib/aarch64-linux-gnu/libcudnn_ops_infer.so.8
#25 0x0000ffffc87e09dc in ?? () from /lib/aarch64-linux-gnu/libcudnn_ops_infer.so.8
#26 0x0000ffffc87c2780 in ?? () from /lib/aarch64-linux-gnu/libcudnn_ops_infer.so.8
#27 0x0000ffffc87f7a2c in ?? () from /lib/aarch64-linux-gnu/libcudnn_ops_infer.so.8
#28 0x0000ffffc71bd090 in cudnnCreate () from /lib/aarch64-linux-gnu/libcudnn_ops_infer.so.8
#29 0x0000aaaaaaaa11b8 in main ()
(gdb) 

Jetpack version: 5.1.2
CUDA version: 11.4
CUDNN version: 8.2.4
I think the code should be fine, since it works on another board that runs JetPack 5.1.1 with the same CUDA version.
Can anyone help me? Thanks a lot!
BR/Time

I also tried the 11.4.3-cudnn8-runtime-ubuntu20.04 Docker container from CUDA | NVIDIA NGC. It still blocks…

Hi,

Is the other working board also AGX Orin?
Could you share how you compile the code?

Also, 11.4.3-cudnn8-runtime-ubuntu20.04 doesn’t support Jetson devices; please use a container with the l4t tag instead.
For example, l4t-jetpack:r35.4.1:

Thanks.