TensorRT-LLM error message

Hi, can anyone please advise what this error message is about? "

Error processing query: LLM Call Exception: [500] Internal Server Error
Error during inference of request chat-4bd234c02ea240babd0a16fa4ad4543b – Encountered an error in forwardAsync function: [TensorRT-LLM][ERROR] CUDA runtime error in cudaMemcpy2DAsync( dstPtr, copyPitch, srcPtr, copyPitch, copyWidth, copyHeight, cudaMemcpyHostToDevice, cudaStream.get()): unknown error (/home/jenkins/agent/workspace/LLM/release-0.11/L0_MergeRequest/llm/cpp/tensorrt_llm/batch_manager/transformerBuffers.cpp:255)
1 0x7ff0dbe21c2e void tensorrt_llm::common::check(cudaError, char const*, char const*, int) + 94
2 0x7ff0dd050c71 tensorrt_llm::batch_manager::TransformerBuffers::copyKvBlockOffsets(std::vector<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > const&, std::vector<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > const&, tensorrt_llm::batch_manager::kv_cache_manager::KVCacheManager const*, tensorrt_llm::batch_manager::kv_cache_manager::KVCacheManager const*, tensorrt_llm::runtime::TllmRuntime const&) + 1281
3 0x7ff0dd048647 tensorrt_llm::batch_manager::RuntimeBuffers::setFromInputs(std::vector<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > const&, std::vector<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > const&, int, int, tensorrt_llm::batch_manager::DecoderBuffers&, tensorrt_llm::batch_manager::kv_cache_manager::KVCacheManager*, tensorrt_llm::batch_manager::kv_cache_manager::KVCacheManager*, tensorrt_llm::batch_manager::rnn_state_manager::RnnStateManager*, std::map<unsigned long, std::shared_ptr<std::vector<tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig, std::allocator<tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig> > >, std::less, std::allocator<std::pair<unsigned long const, std::shared_ptr<std::vector<tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig, std::allocator<tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig> > > > > > const&, tensorrt_llm::runtime::TllmRuntime const&, tensorrt_llm::runtime::ModelConfig const&, tensorrt_llm::runtime::WorldConfig const&) + 9783
4 0x7ff0dd04b612 tensorrt_llm::batch_manager::RuntimeBuffers::prepareStep(std::vector<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > const&, std::vector<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > const&, int, int, tensorrt_llm::batch_manager::DecoderBuffers&, tensorrt_llm::batch_manager::kv_cache_manager::KVCacheManager*, tensorrt_llm::batch_manager::kv_cache_manager::KVCacheManager*, tensorrt_llm::batch_manager::rnn_state_manager::RnnStateManager*, std::map<unsigned long, std::shared_ptr<std::vector<tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig, std::allocator<tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig> > >, std::less, std::allocator<std::pair<unsigned long const, std::shared_ptr<std::vector<tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig, std::allocator<tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig> > > > > > const&, tensorrt_llm::runtime::TllmRuntime const&, tensorrt_llm::runtime::ModelConfig const&, tensorrt_llm::runtime::WorldConfig const&) + 194
5 0x7ff0dd06ace2 tensorrt_llm::batch_manager::TrtGptModelInflightBatching::executeStep(std::vector<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > const&, std::vector<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > const&, int) + 162
6 0x7ff0dd06ae96 tensorrt_llm::batch_manager::TrtGptModelInflightBatching::executeBatch(tensorrt_llm::batch_manager::ScheduledRequests const&) + 246
7 0x7ff0dd06b819 tensorrt_llm::batch_manager::TrtGptModelInflightBatching::forwardAsync(std::list<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > const&) + 2409
8 0x7ff0dd08c19b tensorrt_llm::executor::Executor::Impl::forwardAsync(std::list<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > >&) + 411
9 0x7ff0dd08f9dd tensorrt_llm::executor::Executor::Impl::executionLoop() + 301
10 0x7ff5614f4253 /usr/lib/x86_64-linux-gnu/libstdc++.so.6(+0xdc253) [0x7ff5614f4253]
11 0x7ff563468ac3 /usr/lib/x86_64-linux-gnu/libc.so.6(+0x94ac3) [0x7ff563468ac3]
12 0x7ff5634faa40 /usr/lib/x86_64-linux-gnu/libc.so.6(+0x126a40) [0x7ff5634faa40]
"

Hi @denglhs1 – is it possible you are running out of memory, either on the GPU or in host memory? If not, there may be an issue with the TensorRT-LLM engine itself. Can you share more details on which model you are running and on what GPU(s)?
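
One quick way to rule out memory pressure is to log GPU and host memory right before the failing request is processed. Here is a minimal sketch; the helper name `log_memory` is hypothetical, and it assumes PyTorch with CUDA and `psutil` are available in your serving environment (adapt to whatever tooling your server uses):

```python
# Hypothetical helper: print free/total GPU memory and host RAM usage so you
# can see whether either is exhausted around the time the request fails.
# Assumes PyTorch (with CUDA) and psutil are installed.
import torch
import psutil

def log_memory(tag: str = "") -> None:
    free_b, total_b = torch.cuda.mem_get_info()  # device memory in bytes
    host = psutil.virtual_memory()               # host RAM usage
    print(f"[{tag}] GPU free/total: {free_b / 1e9:.2f}/{total_b / 1e9:.2f} GB | "
          f"host used: {host.percent:.1f}%")

log_memory("before request")
```

Running `nvidia-smi` in a separate terminal while the server handles traffic can give you the same picture without touching the code. If memory looks fine in both places, the engine build details (model, precision, max batch size, GPU type) would help narrow this down.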