Description
TensorRT consistently crashes (segmentation fault) during engine building under particular circumstances. Testing shows the following:
- Not all GPUs are affected: for example, a T4 is affected, while an A10 and an RTX 2070 are not.
- Only a subset of networks (we use ONNX format) is affected.
- No problems when FP16 is disabled.
- No problems when using the primary CUDA context from CUDART (cudaSetDevice and such); a sketch contrasting the two context setups follows this list.
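For reference, here is a minimal sketch of the two context setups compared above (the complete program is under Steps To Reproduce). The cuDevicePrimaryCtxRetain variant is only an illustration of the primary-context idea and was not tested; only the CUDART form (cudaSetDevice) was verified to avoid the crash:

#include <cassert>
#include <cuda.h>
#include <cuda_runtime.h>

// Variant A (crashes): explicit driver-API context created with cuCtxCreate.
CUcontext make_custom_context(int device_index)
{
    CUresult st = cuInit(0); assert(st == CUDA_SUCCESS);
    CUdevice dev;
    st = cuDeviceGet(&dev, device_index); assert(st == CUDA_SUCCESS);
    CUcontext ctx;
    st = cuCtxCreate(&ctx, CU_CTX_SCHED_BLOCKING_SYNC, dev); assert(st == CUDA_SUCCESS);
    return ctx; // cuCtxCreate also makes ctx current on this thread
}

// Variant B (no crash observed): the device's primary context.
// Only the CUDART form was tested; the commented-out driver-API retain is an
// untested equivalent shown for illustration only.
void use_primary_context(int device_index)
{
    cudaError_t err = cudaSetDevice(device_index); assert(err == cudaSuccess);
    // CUdevice dev;      cuDeviceGet(&dev, device_index);
    // CUcontext primary; cuDevicePrimaryCtxRetain(&primary, dev);
    // cuCtxSetCurrent(primary);
}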
The following log lines, passed via ILogger, immediately precede the crash:
{ForeignNode[model.pos_embed.../embedding/embedding.3/Add_1]} (Myelin[0x80000023]) profiling completed in 14.1957 seconds. Fastest Tactic: 0x0000000000000000 Time: 46.116
>>>>>>>>>>>>>>> Chose Runner Type: Myelin Tactic: 0x0000000000000000
*************** Autotuning format combination: Half(200704,196,14,1) -> Half(512,1) ***************
--------------- Timing Runner: {ForeignNode[model.pos_embed.../embedding/embedding.3/Add_1]} (Myelin[0x80000023])
Segmentation fault
With the primary CUDA context (instead of a custom CUcontext), the build proceeds past the same point; the corresponding place in the log:
{ForeignNode[model.pos_embed.../embedding/embedding.3/Add_1]} (Myelin[0x80000023]) profiling completed in 11.3497 seconds. Fastest Tactic: 0x0000000000000000 Time: 44.6204
>>>>>>>>>>>>>>> Chose Runner Type: Myelin Tactic: 0x0000000000000000
*************** Autotuning format combination: Half(200704,196,14,1) -> Half(512,1) ***************
--------------- Timing Runner: {ForeignNode[model.pos_embed.../embedding/embedding.3/Add_1]} (Myelin[0x80000023])
(foreignNode) Set user's cuda kernel library
(foreignNode) Autotuning op __mye372061+__mye372049+__mye372037: [ONNX Layer: /blocks/blocks.0/attn/qkv/MatMul]^_[ONNX Layer: /blocks/blocks.0/attn/qkv/Add]
(foreignNode) Sorted table of all evaluated tactics:
(foreignNode) tactic_id, cost(in ms), cost/fastest_cost, prediction_correlation, kernel_name, tactic_hash, tunable_parameter
(foreignNode) 21, 0.1429504, 1.00000, 0.32722, sm75_xmma_gemm_f16f16_f16f16_f16_nn_n_tilesize128x128x32_stage1_warpsize2x2x1_tensor16x8x8, 0x656d80c8951a0,
(foreignNode) 20, 0.1436416, 1.00484, 0.32565, sm75_xmma_gemm_f16f16_f16f32_f32_nn_n_tilesize128x128x32_stage1_warpsize2x2x1_tensor16x8x8, 0x36c42f0898cc7,
(foreignNode) 15, 0.1439072, 1.00669, 0.24993, sm75_xmma_gemm_f16f16_f16f16_f16_tn_n_tilesize128x64x64_stage1_warpsize2x2x1_tensor16x8x8, 0x4a95151ea6a7f,
...
The crash backtrace (shown for a build against the static nvinfer libraries, which give more meaningful output with symbols):
#0 0x0000555556dd54e2 in libnvinfer_23e040836dbc178fb243c1e255ce66b5d4883429 ()
#1 0x0000555557453d0c in libnvinfer_87440430d13387290b97cb05721dd7ad3aad9bdb ()
#2 0x00005555574543e5 in libnvinfer_a3d1c96b39704ca48167498a3629f8b956f554ab ()
#3 0x0000555557454bec in libnvinfer_145775694b771afd7db74fd1691aa856be3b16dd ()
#4 0x00005555572fa0b0 in libnvinfer_0e645f695b3d93db4274ad9bd7f37a72ae41035f ()
#5 0x000055555a0b5eb4 in libnvinfer_8795dfda6f63d5150e8253262fb6dc7c4533fa77 ()
#6 0x0000555558391b7c in libnvinfer_c42578067ba48d457799b72027df8551635807a9 ()
#7 0x0000555558d474e3 in libnvinfer_89d557c50c297c19ed4e10e39fc3e8ac2a335e89 ()
#8 0x0000555558d481f8 in ?? ()
#9 0x0000555558d491f6 in ?? ()
#10 0x00005555583a5327 in libnvinfer_dcecb71fc8c355dc1898f957bf7f17cf2c8bbf73 ()
#11 0x00007ffff61bddb4 in ?? () from /lib/x86_64-linux-gnu/libstdc++.so.6
#12 0x00007ffff5e44aa4 in start_thread (arg=<optimized out>) at ./nptl/pthread_create.c:447
#13 0x00007ffff5ed1c3c in clone3 () at ../sysdeps/unix/sysv/linux/x86_64/clone3.S:78
The crash appears to be a null pointer dereference; the kernel log shows a fault at address 0 with error code 4 (a user-mode read):
simple[3544395]: segfault at 0 ip 0000562e7574c4e2 sp 00007f55454ab4f0 error 4 in simple[562e74f24000+17b95000]
Environment
TensorRT Version: 10.12.0.36
GPU Type: NVIDIA Tesla T4
NVIDIA Driver Version: 535.183.01
CUDA Version: 12.2
CUDNN Version: N/A
Operating System + Version: Ubuntu 24.04
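For completeness, the driver, runtime, and TensorRT versions can also be queried programmatically; a minimal sketch (not part of the repro; assumes device 0):

#include <cstdio>
#include <cuda_runtime.h>
#include <NvInfer.h> // getInferLibVersion()

int main()
{
    int driver_ver = 0, runtime_ver = 0;
    cudaDriverGetVersion(&driver_ver);   // CUDA version supported by the installed driver
    cudaRuntimeGetVersion(&runtime_ver); // CUDA runtime the binary is linked against
    cudaDeviceProp prop{};
    cudaGetDeviceProperties(&prop, 0);
    std::printf("driver CUDA %d, runtime CUDA %d, GPU %s (sm_%d%d), TensorRT %d\n",
                driver_ver, runtime_ver, prop.name, prop.major, prop.minor,
                (int)getInferLibVersion());
    return 0;
}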
Steps To Reproduce
The reproduction code is minimized and quite simple:
#include <cassert>
#include <fstream>
#include <iostream>
#include <memory>

#include <cuda.h>
#include <cuda_runtime.h>

#include <NvInfer.h>
#include <NvOnnxParser.h>

using namespace std;
using namespace nvinfer1;

// Forwards all TensorRT messages to stderr.
class InferLogger: public ILogger
{
public:
    void log(Severity severity, const char * msg) noexcept override { cerr << msg << '\n'; }
};

int main(int argc, char * argv[])
{
    if (argc != 2) { cerr << "expecting one argument: onnx path\n"; return 1; }
    ifstream f(argv[1], ios::binary); assert(bool(f));
    string onnx_buf((istreambuf_iterator<char>(f)), istreambuf_iterator<char>());

    InferLogger logger;

#if defined(CU_DEVICE)
    // Driver-API path: a custom CUcontext; this is the configuration that crashes.
    CUresult status;
    status = cuInit(0); assert(status == CUDA_SUCCESS);
    CUdevice dev;
    status = cuDeviceGet(&dev, 2); assert(status == CUDA_SUCCESS);
    CUcontext ctx;
    status = cuCtxCreate(&ctx, CU_CTX_SCHED_BLOCKING_SYNC, dev); assert(status == CUDA_SUCCESS);
    status = cuCtxPushCurrent(ctx); assert(status == CUDA_SUCCESS);
#else
    // CUDART path: the primary context; no crash.
    cudaSetDevice(2);
#endif

    {
        unique_ptr<IBuilder> builder_ptr{createInferBuilder(logger)}; assert(builder_ptr != nullptr);
        unique_ptr<IBuilderConfig> cfg_ptr{builder_ptr->createBuilderConfig()}; assert(cfg_ptr != nullptr);
        cfg_ptr->setFlag(BuilderFlag::kFP16); // no crash when FP16 is disabled
        unique_ptr<INetworkDefinition> def_ptr{builder_ptr->createNetworkV2(0U)}; assert(def_ptr != nullptr);
        unique_ptr<nvonnxparser::IParser> parser_ptr{nvonnxparser::createParser(*def_ptr, logger)}; assert(parser_ptr != nullptr);
        bool ok = parser_ptr->parse(onnx_buf.data(), onnx_buf.size()); assert(ok);
        // The segmentation fault happens inside buildSerializedNetwork().
        unique_ptr<IHostMemory> serialized_engine{builder_ptr->buildSerializedNetwork(*def_ptr, *cfg_ptr)}; assert(serialized_engine != nullptr);
    }

#if defined(CU_DEVICE)
    cuCtxPopCurrent(nullptr);
    cuCtxDestroy(ctx);
#endif

    cerr << "Looks OK!\n";
    return 0;
}
Environment setup:
dpkg -i cuda-repo-ubuntu2404-12-9-local_12.9.1-575.57.08-1_amd64.deb
cp /var/cuda-repo-ubuntu2404-12-9-local/cuda-B64325C0-keyring.gpg /usr/share/keyrings/
dpkg -i nv-tensorrt-local-repo-ubuntu2004-10.12.0-cuda-12.9_1.0-1_amd64.deb
cp /var/nv-tensorrt-local-repo-ubuntu2004-10.12.0-cuda-12.9/nv-tensorrt-local-6E2B6563-keyring.gpg /usr/share/keyrings/
apt update
apt install -y libnvinfer-headers-dev libnvinfer10 libnvinfer-headers-plugin-dev libnvonnxparsers-dev libnvonnxparsers10 cuda-cudart-dev-12-9 libcudart12 cuda-crt-12-9
g++ -DCU_DEVICE -ggdb3 -I /usr/local/cuda-12.9/targets/x86_64-linux/include simple.cc -lnvinfer -lnvonnxparser -L /usr/local/cuda-12.9/targets/x86_64-linux/lib -lcudart -lcuda
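The resulting binary (a.out, since no -o is given) is run with the path to an affected ONNX model as its single argument; building without -DCU_DEVICE selects the CUDART primary-context path, which does not crash. For example (model.onnx is a placeholder for one of the affected networks):
./a.out model.onnx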